inkmark 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Cargo.lock +940 -0
- data/Cargo.toml +27 -0
- data/LICENSE.txt +21 -0
- data/NOTICE +16 -0
- data/README.md +1166 -0
- data/ext/inkmark/Cargo.toml +31 -0
- data/ext/inkmark/build.rs +5 -0
- data/ext/inkmark/extconf.rb +6 -0
- data/ext/inkmark/src/autolink.rs +167 -0
- data/ext/inkmark/src/chunks_by_heading.rs +325 -0
- data/ext/inkmark/src/chunks_by_size.rs +302 -0
- data/ext/inkmark/src/document.rs +411 -0
- data/ext/inkmark/src/emoji.rs +197 -0
- data/ext/inkmark/src/handler.rs +758 -0
- data/ext/inkmark/src/heading.rs +262 -0
- data/ext/inkmark/src/highlight.rs +202 -0
- data/ext/inkmark/src/image.rs +284 -0
- data/ext/inkmark/src/lib.rs +54 -0
- data/ext/inkmark/src/link.rs +291 -0
- data/ext/inkmark/src/options.rs +231 -0
- data/ext/inkmark/src/plain_text.rs +445 -0
- data/ext/inkmark/src/scheme_filter.rs +319 -0
- data/ext/inkmark/src/stats.rs +453 -0
- data/ext/inkmark/src/tag_filter.rs +226 -0
- data/ext/inkmark/src/toc.rs +221 -0
- data/ext/inkmark/src/truncate.rs +267 -0
- data/ext/inkmark/src/url_match.rs +178 -0
- data/lib/inkmark/event.rb +342 -0
- data/lib/inkmark/native.rb +8 -0
- data/lib/inkmark/options.rb +698 -0
- data/lib/inkmark/toc.rb +40 -0
- data/lib/inkmark/version.rb +6 -0
- data/lib/inkmark.rb +711 -0
- data/sig/inkmark.rbs +219 -0
- metadata +208 -0
|
@@ -0,0 +1,411 @@
|
|
|
1
|
+
use magnus::{Error, RHash, Ruby};
|
|
2
|
+
use pulldown_cmark::{html, Event, Options, Parser};
|
|
3
|
+
|
|
4
|
+
use crate::autolink;
|
|
5
|
+
use crate::emoji;
|
|
6
|
+
use crate::heading;
|
|
7
|
+
use crate::highlight;
|
|
8
|
+
use crate::image;
|
|
9
|
+
use crate::link;
|
|
10
|
+
use crate::options::{build_options, Flags};
|
|
11
|
+
use crate::plain_text;
|
|
12
|
+
use crate::scheme_filter::SchemeFilter;
|
|
13
|
+
use crate::stats;
|
|
14
|
+
use crate::tag_filter;
|
|
15
|
+
use crate::toc;
|
|
16
|
+
|
|
17
|
+
// When `opts_hash` is nil (Ruby passes nil), the caller signals that no
|
|
18
|
+
// options have been setβwe skip build_options entirely and use hardcoded
|
|
19
|
+
// defaults. This eliminates N hash lookups + N symbol creations per render.
|
|
20
|
+
pub fn native_to_html(
|
|
21
|
+
ruby: &Ruby,
|
|
22
|
+
source: String,
|
|
23
|
+
opts_hash: Option<RHash>,
|
|
24
|
+
) -> Result<String, Error> {
|
|
25
|
+
match opts_hash {
|
|
26
|
+
None => Ok(render_defaults(&source)),
|
|
27
|
+
Some(hash) => {
|
|
28
|
+
let (cm_opts, flags) = build_options(ruby, hash)?;
|
|
29
|
+
Ok(render(&source, cm_opts, flags))
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
pub fn native_to_markdown(
|
|
35
|
+
ruby: &Ruby,
|
|
36
|
+
source: String,
|
|
37
|
+
opts_hash: Option<RHash>,
|
|
38
|
+
) -> Result<String, Error> {
|
|
39
|
+
match opts_hash {
|
|
40
|
+
None => Ok(markdown_defaults(&source)),
|
|
41
|
+
Some(hash) => {
|
|
42
|
+
let (cm_opts, flags) = build_options(ruby, hash)?;
|
|
43
|
+
Ok(render_to_markdown(&source, cm_opts, flags))
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
pub fn native_to_plain_text(
|
|
49
|
+
ruby: &Ruby,
|
|
50
|
+
source: String,
|
|
51
|
+
opts_hash: Option<RHash>,
|
|
52
|
+
) -> Result<String, Error> {
|
|
53
|
+
match opts_hash {
|
|
54
|
+
None => Ok(plain_text_defaults(&source)),
|
|
55
|
+
Some(hash) => {
|
|
56
|
+
let (cm_opts, flags) = build_options(ruby, hash)?;
|
|
57
|
+
Ok(render_to_plain_text(&source, cm_opts, flags))
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
/// Fast path. Hardcoded-defaults. Matches Inkmark::Options::DEFAULTS exactly:
|
|
63
|
+
/// GFM + tables + strikethrough + tasklists + footnotes on, raw HTML
|
|
64
|
+
/// suppressed, all allowlists off.
|
|
65
|
+
fn render_defaults(source: &str) -> String {
|
|
66
|
+
let mut buf = String::with_capacity(source.len() * 3 / 2);
|
|
67
|
+
let parser = Parser::new_ext(source, default_cm_opts());
|
|
68
|
+
let filtered = parser.map(suppress_raw_html);
|
|
69
|
+
html::push_html(&mut buf, filtered);
|
|
70
|
+
buf
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
/// Same as render_defaults but serializes to Markdown instead of HTML.
|
|
74
|
+
fn markdown_defaults(source: &str) -> String {
|
|
75
|
+
let mut buf = String::with_capacity(source.len());
|
|
76
|
+
let parser = Parser::new_ext(source, default_cm_opts());
|
|
77
|
+
let filtered = parser.map(suppress_raw_html);
|
|
78
|
+
cmark_write(filtered, &mut buf);
|
|
79
|
+
buf
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
/// Defaults-only plain-text fast path. Mirrors `markdown_defaults`:
|
|
83
|
+
/// same GFM baseline, same raw-HTML suppression.
|
|
84
|
+
fn plain_text_defaults(source: &str) -> String {
|
|
85
|
+
let mut buf = String::with_capacity(source.len());
|
|
86
|
+
let parser = Parser::new_ext(source, default_cm_opts());
|
|
87
|
+
let filtered = parser.map(suppress_raw_html);
|
|
88
|
+
plain_text::write_plain_text(filtered, &mut buf);
|
|
89
|
+
buf
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
fn default_cm_opts() -> Options {
|
|
93
|
+
let mut opts = Options::empty();
|
|
94
|
+
opts.insert(Options::ENABLE_GFM);
|
|
95
|
+
opts.insert(Options::ENABLE_TABLES);
|
|
96
|
+
opts.insert(Options::ENABLE_STRIKETHROUGH);
|
|
97
|
+
opts.insert(Options::ENABLE_TASKLISTS);
|
|
98
|
+
opts.insert(Options::ENABLE_FOOTNOTES);
|
|
99
|
+
opts
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
#[inline]
|
|
103
|
+
fn suppress_raw_html(event: Event) -> Event {
|
|
104
|
+
match event {
|
|
105
|
+
Event::Html(h) | Event::InlineHtml(h) => Event::Text(h),
|
|
106
|
+
other => other,
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
#[inline]
|
|
111
|
+
fn hard_wrap(event: Event) -> Event {
|
|
112
|
+
match event {
|
|
113
|
+
Event::SoftBreak => Event::HardBreak,
|
|
114
|
+
other => other,
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
/// Full render: parse once, collect stats + TOC from original events,
|
|
119
|
+
/// apply filters, render HTML. Returns a Ruby Hash:
|
|
120
|
+
///
|
|
121
|
+
/// ```ruby
|
|
122
|
+
/// { html: "...", toc: "...", toc_html: "...", statistics: {...} }
|
|
123
|
+
/// ```
|
|
124
|
+
///
|
|
125
|
+
/// `statistics: true` implies full stats + TOC. `toc: true` alone gives
|
|
126
|
+
/// TOC + a lightweight stats hash (heading_count only). Keys whose
|
|
127
|
+
/// feature flag is off are set to nil.
|
|
128
|
+
pub fn native_render_full(ruby: &Ruby, source: String, opts_hash: RHash) -> Result<RHash, Error> {
|
|
129
|
+
let (cm_opts, mut flags) = build_options(ruby, opts_hash)?;
|
|
130
|
+
|
|
131
|
+
// statistics implies toc + heading_ids
|
|
132
|
+
if flags.statistics {
|
|
133
|
+
flags.toc = true;
|
|
134
|
+
flags.heading_ids = true;
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
// Mutual toc / extract[:headings]: one walk powers both surfaces,
|
|
138
|
+
// so enabling either exposes the heading data on both. Keeps users
|
|
139
|
+
// from having to set two flags when the cost is identical.
|
|
140
|
+
if flags.extract.headings {
|
|
141
|
+
flags.toc = true;
|
|
142
|
+
}
|
|
143
|
+
if flags.toc {
|
|
144
|
+
flags.extract.headings = true;
|
|
145
|
+
flags.heading_ids = true;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
// Parse with offset iterator so stats::collect can attach byte
|
|
149
|
+
// ranges to each extract record. The filter pipeline only needs
|
|
150
|
+
// Event values, so we split the tuple into two vecs and drop the
|
|
151
|
+
// ranges before filters run.
|
|
152
|
+
let offset_events: Vec<(Event, std::ops::Range<usize>)> = Parser::new_ext(&source, cm_opts)
|
|
153
|
+
.into_offset_iter()
|
|
154
|
+
.collect();
|
|
155
|
+
|
|
156
|
+
// Collect stats/TOC from original events (before filters)
|
|
157
|
+
let collected = stats::collect(&offset_events);
|
|
158
|
+
|
|
159
|
+
// Strip ranges, apply filters, render HTML.
|
|
160
|
+
let events: Vec<Event> = offset_events.into_iter().map(|(e, _)| e).collect();
|
|
161
|
+
let events = apply_filters(events, &flags);
|
|
162
|
+
let mut buf = String::with_capacity(source.len() * 3 / 2);
|
|
163
|
+
html::push_html(&mut buf, events.into_iter());
|
|
164
|
+
|
|
165
|
+
// Build result hash
|
|
166
|
+
let result = ruby.hash_new();
|
|
167
|
+
result.aset(ruby.to_symbol("html"), buf)?;
|
|
168
|
+
|
|
169
|
+
if flags.toc {
|
|
170
|
+
let toc_md = toc::toc_to_markdown(&collected.toc_entries, flags.toc_depth);
|
|
171
|
+
let toc_html_str = toc::toc_to_html(&collected.toc_entries, flags.toc_depth);
|
|
172
|
+
result.aset(ruby.to_symbol("toc"), toc_md)?;
|
|
173
|
+
result.aset(ruby.to_symbol("toc_html"), toc_html_str)?;
|
|
174
|
+
} else {
|
|
175
|
+
result.aset(ruby.to_symbol("toc"), ())?;
|
|
176
|
+
result.aset(ruby.to_symbol("toc_html"), ())?;
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
let stats_hash = stats::to_statistics_hash(ruby, &collected, flags.statistics)?;
|
|
180
|
+
result.aset(ruby.to_symbol("statistics"), stats_hash)?;
|
|
181
|
+
|
|
182
|
+
// Extracts: present when any extract flag is set (either directly,
|
|
183
|
+
// or implicitly via toc β headings). Nil otherwise so `md.extracts`
|
|
184
|
+
// returns nil for callers who didn't ask.
|
|
185
|
+
if flags.extract.any() {
|
|
186
|
+
let extracts_hash = stats::to_extracts_hash(ruby, &collected, flags.extract)?;
|
|
187
|
+
result.aset(ruby.to_symbol("extracts"), extracts_hash)?;
|
|
188
|
+
} else {
|
|
189
|
+
result.aset(ruby.to_symbol("extracts"), ())?;
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
// Frontmatter: raw YAML text extracted from MetadataBlock events.
|
|
193
|
+
// Ruby side parses with YAML.safe_load.
|
|
194
|
+
match &collected.frontmatter {
|
|
195
|
+
Some(fm) => result.aset(ruby.to_symbol("frontmatter"), fm.as_str())?,
|
|
196
|
+
None => result.aset(ruby.to_symbol("frontmatter"), ())?,
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
Ok(result)
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
fn render(source: &str, cm_opts: pulldown_cmark::Options, flags: Flags) -> String {
|
|
203
|
+
let mut buf = String::with_capacity(source.len() * 3 / 2);
|
|
204
|
+
let parser = Parser::new_ext(source, cm_opts);
|
|
205
|
+
|
|
206
|
+
// Fast path: no buffering filter is active. Stream events straight
|
|
207
|
+
// from the parser through push_html with at most one iterator-level
|
|
208
|
+
// map, zero Vec<Event> allocations. This is the hot path for the
|
|
209
|
+
// default config (only suppress_raw_html is on).
|
|
210
|
+
if !needs_buffer(&flags) {
|
|
211
|
+
html::push_html(&mut buf, parser.map(stream_filter(&flags)));
|
|
212
|
+
return buf;
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
let events = apply_filters(parser.collect(), &flags);
|
|
216
|
+
html::push_html(&mut buf, events.into_iter());
|
|
217
|
+
buf
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
fn render_to_markdown(source: &str, cm_opts: pulldown_cmark::Options, flags: Flags) -> String {
|
|
221
|
+
let mut buf = String::with_capacity(source.len());
|
|
222
|
+
let parser = Parser::new_ext(source, cm_opts);
|
|
223
|
+
|
|
224
|
+
if !needs_buffer(&flags) {
|
|
225
|
+
cmark_write(parser.map(stream_filter(&flags)), &mut buf);
|
|
226
|
+
return buf;
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
let events = apply_filters(parser.collect(), &flags);
|
|
230
|
+
cmark_write(events.into_iter(), &mut buf);
|
|
231
|
+
buf
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
fn render_to_plain_text(source: &str, cm_opts: pulldown_cmark::Options, flags: Flags) -> String {
|
|
235
|
+
let mut buf = String::with_capacity(source.len());
|
|
236
|
+
let parser = Parser::new_ext(source, cm_opts);
|
|
237
|
+
|
|
238
|
+
if !needs_buffer(&flags) {
|
|
239
|
+
plain_text::write_plain_text(parser.map(stream_filter(&flags)), &mut buf);
|
|
240
|
+
return buf;
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
let events = apply_filters(parser.collect(), &flags);
|
|
244
|
+
plain_text::write_plain_text(events.into_iter(), &mut buf);
|
|
245
|
+
buf
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
/// Fast-path event mapper. Combines the streaming filtersβ
|
|
249
|
+
/// `suppress_raw_html`, `hard_wrap`, and GFM tagfilterβinto one
|
|
250
|
+
/// closure so the three render entry points share one implementation.
|
|
251
|
+
/// Buffered filters (TOC, allowlists, etc.) go through `apply_filters`
|
|
252
|
+
/// instead.
|
|
253
|
+
fn stream_filter(flags: &Flags) -> impl Fn(Event) -> Event {
|
|
254
|
+
let shtml = flags.suppress_raw_html;
|
|
255
|
+
let hwrap = flags.hard_wrap;
|
|
256
|
+
|
|
257
|
+
// Tagfilter runs only when we're passing raw HTML through AND
|
|
258
|
+
// GFM is active AND the user hasn't opted out. Its output is
|
|
259
|
+
// otherwise wasted work (suppress_raw_html escapes everything).
|
|
260
|
+
let tagf = !flags.suppress_raw_html && flags.gfm && flags.gfm_tag_filter;
|
|
261
|
+
move |e| {
|
|
262
|
+
let e = if tagf { tag_filter::apply_event(e) } else { e };
|
|
263
|
+
let e = if shtml { suppress_raw_html(e) } else { e };
|
|
264
|
+
if hwrap {
|
|
265
|
+
hard_wrap(e)
|
|
266
|
+
} else {
|
|
267
|
+
e
|
|
268
|
+
}
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
/// Returns true when any active filter requires materializing the event stream
|
|
273
|
+
/// into a Vec before processing. The fast path avoids this allocation entirely.
|
|
274
|
+
fn needs_buffer(flags: &Flags) -> bool {
|
|
275
|
+
flags.heading_ids
|
|
276
|
+
|| flags.emoji_shortcodes
|
|
277
|
+
|| flags.autolink
|
|
278
|
+
|| flags.lazy_images
|
|
279
|
+
|| flags.nofollow_external_links
|
|
280
|
+
|| flags.syntax_highlight
|
|
281
|
+
|| flags.allowed_link_hosts.is_some()
|
|
282
|
+
|| flags.allowed_image_hosts.is_some()
|
|
283
|
+
|| flags.allowed_link_schemes.is_some()
|
|
284
|
+
|| flags.allowed_image_schemes.is_some()
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
/// Apply all active event-level filters to a materialized event Vec.
|
|
288
|
+
/// Shared by `render`, `render_to_markdown`, and `native_render_full`.
|
|
289
|
+
pub fn apply_filters<'a>(events: Vec<Event<'a>>, flags: &Flags) -> Vec<Event<'a>> {
|
|
290
|
+
let events = apply_pre_handler_filters(events, flags);
|
|
291
|
+
apply_post_handler_filters(events, flags)
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
/// Enrichment filters that run before user handlers:
|
|
295
|
+
/// emoji => autolink => heading_ids => suppress_raw_html.
|
|
296
|
+
///
|
|
297
|
+
/// Handlers see emoji-resolved text, autolinked URLs, and heading IDs
|
|
298
|
+
/// already set. Code blocks are still Code events (not yet highlighted).
|
|
299
|
+
///
|
|
300
|
+
/// Order matters.
|
|
301
|
+
pub fn apply_pre_handler_filters<'a>(mut events: Vec<Event<'a>>, flags: &Flags) -> Vec<Event<'a>> {
|
|
302
|
+
// Emoji shortcodes run before heading IDs so a heading like
|
|
303
|
+
// `# :rocket: Launching` generates its slug from the rendered "π"
|
|
304
|
+
// rather than from the raw ":rocket:" text.
|
|
305
|
+
if flags.emoji_shortcodes {
|
|
306
|
+
emoji::replace(&mut events);
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
// Autolink runs after emoji (so :rocket: is already a char, not a
|
|
310
|
+
// false-positive URL pattern) but before heading_ids (so heading
|
|
311
|
+
// text containing a URL gets that URL linked before the slug is
|
|
312
|
+
// computed). It emits Start(Link)/Text/End(Link), not Event::Html,
|
|
313
|
+
// so it can run before suppress_raw_html safely.
|
|
314
|
+
if flags.autolink {
|
|
315
|
+
events = autolink::autolink(events);
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
if flags.heading_ids {
|
|
319
|
+
heading::add_ids(&mut events);
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
// GFM tagfilter: escape the nine disallowed tag names in raw HTML.
|
|
323
|
+
// Only runs when raw HTML is actually being passed throughβwhen
|
|
324
|
+
// suppress_raw_html is on, everything becomes escaped text anyway,
|
|
325
|
+
// and running tagfilter first would double-escape via Text events.
|
|
326
|
+
if !flags.suppress_raw_html && flags.gfm && flags.gfm_tag_filter {
|
|
327
|
+
for event in events.iter_mut() {
|
|
328
|
+
if matches!(event, Event::Html(_) | Event::InlineHtml(_)) {
|
|
329
|
+
let taken = std::mem::replace(event, Event::SoftBreak);
|
|
330
|
+
*event = tag_filter::apply_event(taken);
|
|
331
|
+
}
|
|
332
|
+
}
|
|
333
|
+
}
|
|
334
|
+
|
|
335
|
+
if flags.suppress_raw_html {
|
|
336
|
+
for event in events.iter_mut() {
|
|
337
|
+
match event {
|
|
338
|
+
Event::Html(_) | Event::InlineHtml(_) => {
|
|
339
|
+
let taken = std::mem::replace(event, Event::SoftBreak);
|
|
340
|
+
match taken {
|
|
341
|
+
Event::Html(h) | Event::InlineHtml(h) => *event = Event::Text(h),
|
|
342
|
+
_ => unreachable!(),
|
|
343
|
+
}
|
|
344
|
+
}
|
|
345
|
+
_ => {}
|
|
346
|
+
}
|
|
347
|
+
}
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
if flags.hard_wrap {
|
|
351
|
+
for event in events.iter_mut() {
|
|
352
|
+
if matches!(event, Event::SoftBreak) {
|
|
353
|
+
*event = Event::HardBreak;
|
|
354
|
+
}
|
|
355
|
+
}
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
events
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
/// HTML-emitting filters that run after user handlers:
|
|
362
|
+
/// syntax_highlight => allowlists => lazy_images => nofollow.
|
|
363
|
+
///
|
|
364
|
+
/// Accepts `Vec<Event<'static>>` so it can be called on the owned events
|
|
365
|
+
/// produced by the handler tree after serialization.
|
|
366
|
+
///
|
|
367
|
+
/// Order matters.
|
|
368
|
+
pub fn apply_post_handler_filters<'a>(mut events: Vec<Event<'a>>, flags: &Flags) -> Vec<Event<'a>> {
|
|
369
|
+
// The filters below all synthesize Event::Html and must run after
|
|
370
|
+
// raw-HTML suppression (done in pre_handler_filters). Suppress_raw_html
|
|
371
|
+
// rewrites every Event::Html to Event::Text, which would HTML-escape
|
|
372
|
+
// our injected tags into visible angle brackets.
|
|
373
|
+
if flags.syntax_highlight {
|
|
374
|
+
events = highlight::highlight(events);
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
// Host and scheme allowlists must run before the Html-emitting
|
|
378
|
+
// filters below, because those collapse Start/End(Link) and
|
|
379
|
+
// Start/End(Image) into single Event::Html eventsβafter which
|
|
380
|
+
// the allowlist can no longer see the dest_url on a structured
|
|
381
|
+
// Link/Image tag.
|
|
382
|
+
if let Some(set) = &flags.allowed_link_hosts {
|
|
383
|
+
events = link::filter_by_hosts(events, set);
|
|
384
|
+
}
|
|
385
|
+
if let Some(set) = &flags.allowed_image_hosts {
|
|
386
|
+
events = image::filter_by_hosts(events, set);
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
// Fuse both scheme filters into a single SchemeFilter passβhandles
|
|
390
|
+
// link and image events in one walk of the stream.
|
|
391
|
+
if flags.allowed_link_schemes.is_some() || flags.allowed_image_schemes.is_some() {
|
|
392
|
+
events = SchemeFilter::new(
|
|
393
|
+
events.into_iter(),
|
|
394
|
+
flags.allowed_link_schemes.as_deref(),
|
|
395
|
+
flags.allowed_image_schemes.as_deref(),
|
|
396
|
+
)
|
|
397
|
+
.collect();
|
|
398
|
+
}
|
|
399
|
+
if flags.lazy_images {
|
|
400
|
+
events = image::add_lazy_loading(events);
|
|
401
|
+
}
|
|
402
|
+
if flags.nofollow_external_links {
|
|
403
|
+
events = link::add_nofollow(events);
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
events
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
fn cmark_write<'a, I: Iterator<Item = Event<'a>>>(events: I, buf: &mut String) {
|
|
410
|
+
pulldown_cmark_to_cmark::cmark(events, buf).expect("markdown serialization failed");
|
|
411
|
+
}
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
//! Emoji shortcode replacement filter.
|
|
2
|
+
//!
|
|
3
|
+
//! When enabled, walks the event stream and replaces gemoji-style
|
|
4
|
+
//! `:shortcode:` sequences in `Event::Text` payloads with the corresponding
|
|
5
|
+
//! emoji character. Lookups use the `emojis` crate's embedded gemoji
|
|
6
|
+
//! database.
|
|
7
|
+
//!
|
|
8
|
+
//! Shortcodes inside fenced code blocks are preserved. Inline code
|
|
9
|
+
//! spans (`Event::Code`) are also preserved because we only transform
|
|
10
|
+
//! `Event::Text` events. Unknown shortcodes are left as literal text.
|
|
11
|
+
|
|
12
|
+
use pulldown_cmark::{CowStr, Event, Tag, TagEnd};
|
|
13
|
+
|
|
14
|
+
/// Apply emoji shortcode replacement to a full event stream in place.
|
|
15
|
+
///
|
|
16
|
+
/// Tracks code-block nesting depth so shortcodes inside fenced code blocks
|
|
17
|
+
/// are preserved. Inline code (`Event::Code`) is passed through untouched
|
|
18
|
+
/// because we only scan `Event::Text` events.
|
|
19
|
+
pub fn replace(events: &mut Vec<Event<'_>>) {
|
|
20
|
+
let mut code_depth: usize = 0;
|
|
21
|
+
|
|
22
|
+
for i in 0..events.len() {
|
|
23
|
+
match &events[i] {
|
|
24
|
+
Event::Start(Tag::CodeBlock(_)) => {
|
|
25
|
+
code_depth += 1;
|
|
26
|
+
continue;
|
|
27
|
+
}
|
|
28
|
+
Event::End(TagEnd::CodeBlock) => {
|
|
29
|
+
code_depth = code_depth.saturating_sub(1);
|
|
30
|
+
continue;
|
|
31
|
+
}
|
|
32
|
+
Event::Text(_) if code_depth == 0 => {}
|
|
33
|
+
_ => continue,
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
// Take ownership of the text so we can feed it to `replace_shortcodes`
|
|
37
|
+
// and emit a new Text event with the result.
|
|
38
|
+
if let Event::Text(text) = std::mem::replace(&mut events[i], Event::SoftBreak) {
|
|
39
|
+
match replace_shortcodes(&text) {
|
|
40
|
+
Some(replaced) => {
|
|
41
|
+
events[i] = Event::Text(CowStr::Boxed(replaced.into_boxed_str()));
|
|
42
|
+
}
|
|
43
|
+
None => {
|
|
44
|
+
events[i] = Event::Text(text);
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/// Scan `text` for `:shortcode:` patterns and replace each match with its
|
|
52
|
+
/// emoji character. Returns `None` when no replacements were made so the
|
|
53
|
+
/// caller can skip rebuilding the event.
|
|
54
|
+
fn replace_shortcodes(text: &str) -> Option<String> {
|
|
55
|
+
// Fast path: if there's no colon at all, there's nothing to replace.
|
|
56
|
+
// This is the common case for most text runs.
|
|
57
|
+
if !text.contains(':') {
|
|
58
|
+
return None;
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
let bytes = text.as_bytes();
|
|
62
|
+
let mut out = String::with_capacity(text.len());
|
|
63
|
+
// `last_emit` points at the first byte we haven't copied into `out` yet.
|
|
64
|
+
// `cursor` is the scanning position, which can run ahead of `last_emit`
|
|
65
|
+
// across unmatched `:` candidates without losing the intermediate text.
|
|
66
|
+
let mut last_emit = 0usize;
|
|
67
|
+
let mut cursor = 0usize;
|
|
68
|
+
let mut replaced_any = false;
|
|
69
|
+
|
|
70
|
+
while let Some(rel) = text[cursor..].find(':') {
|
|
71
|
+
let open = cursor + rel;
|
|
72
|
+
|
|
73
|
+
// Look for the closing colon on the same run. The shortcode body
|
|
74
|
+
// must be non-empty and only contain `[a-z0-9_+-]`. If we hit an
|
|
75
|
+
// invalid char before a closing colon, the whole range is not a
|
|
76
|
+
// shortcode and we continue scanning from just past this open colon.
|
|
77
|
+
let mut close = None;
|
|
78
|
+
let mut scan = open + 1;
|
|
79
|
+
while scan < bytes.len() {
|
|
80
|
+
let b = bytes[scan];
|
|
81
|
+
if b == b':' {
|
|
82
|
+
close = Some(scan);
|
|
83
|
+
break;
|
|
84
|
+
}
|
|
85
|
+
let valid =
|
|
86
|
+
b.is_ascii_lowercase() || b.is_ascii_digit() || b == b'_' || b == b'+' || b == b'-';
|
|
87
|
+
if !valid {
|
|
88
|
+
break;
|
|
89
|
+
}
|
|
90
|
+
scan += 1;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
if let Some(close_idx) = close {
|
|
94
|
+
if close_idx > open + 1 {
|
|
95
|
+
let name = &text[open + 1..close_idx];
|
|
96
|
+
if let Some(emoji) = emojis::get_by_shortcode(name) {
|
|
97
|
+
// Flush the literal run between the last emitted
|
|
98
|
+
// position and this match's open colon, then emit the
|
|
99
|
+
// emoji character in place of the full `:name:` span.
|
|
100
|
+
out.push_str(&text[last_emit..open]);
|
|
101
|
+
out.push_str(emoji.as_str());
|
|
102
|
+
last_emit = close_idx + 1;
|
|
103
|
+
cursor = close_idx + 1;
|
|
104
|
+
replaced_any = true;
|
|
105
|
+
continue;
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
// Not a match (no closing colon, empty name, invalid char, or
|
|
111
|
+
// unknown shortcode).
|
|
112
|
+
cursor = open + 1;
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
if !replaced_any {
|
|
116
|
+
return None;
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
// Flush the tail after the last successful match.
|
|
120
|
+
out.push_str(&text[last_emit..]);
|
|
121
|
+
Some(out)
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
// Unit tests for shortcode replacement. The expected emoji literals are
// written as real Unicode characters so the assertions compare against what
// `emojis::get_by_shortcode` actually returns.
#[cfg(test)]
mod tests {
    use super::{replace, replace_shortcodes};
    use pulldown_cmark::{CowStr, Event};

    #[test]
    fn basic_replacement() {
        assert_eq!(
            replace_shortcodes("Ship it! :rocket:").as_deref(),
            Some("Ship it! 🚀")
        );
    }

    #[test]
    fn multiple_in_one_string() {
        assert_eq!(
            replace_shortcodes(":tada: :rocket: :100:").as_deref(),
            Some("🎉 🚀 💯")
        );
    }

    #[test]
    fn adjacent_shortcodes() {
        assert_eq!(
            replace_shortcodes(":rocket::tada:").as_deref(),
            Some("🚀🎉")
        );
    }

    #[test]
    fn unknown_shortcode_left_as_is() {
        assert_eq!(replace_shortcodes(":not_a_real_emoji:"), None);
        assert_eq!(
            replace_shortcodes(":rocket: and :not_a_real_emoji:").as_deref(),
            Some("🚀 and :not_a_real_emoji:")
        );
    }

    #[test]
    fn fast_path_no_colon() {
        assert_eq!(replace_shortcodes("nothing to see here"), None);
    }

    #[test]
    fn case_sensitive_lowercase_only() {
        // gemoji shortcodes are canonical lowercase—:Rocket: doesn't match.
        assert_eq!(replace_shortcodes(":Rocket:"), None);
    }

    #[test]
    fn bare_colons_unchanged() {
        assert_eq!(replace_shortcodes("8:00:00 am"), None);
        assert_eq!(replace_shortcodes("foo:bar"), None);
        assert_eq!(replace_shortcodes(":"), None);
        assert_eq!(replace_shortcodes("::"), None);
    }

    #[test]
    fn hyphen_and_underscore_in_names() {
        // gemoji uses both. `+1` / `-1` are valid thumbs-up/down.
        assert_eq!(replace_shortcodes(":+1:").as_deref(), Some("👍"));
        assert_eq!(replace_shortcodes(":-1:").as_deref(), Some("👎"));
    }

    #[test]
    fn replace_transforms_rocket_shortcode_in_event_stream() {
        let mut events = vec![Event::Text(CowStr::Borrowed(":rocket:"))];
        replace(&mut events);
        match &events[0] {
            Event::Text(t) => assert_eq!(t.as_ref(), "🚀"),
            other => panic!("expected Text event, got {other:?}"),
        }
    }
}
|