inkmark 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Cargo.lock +940 -0
- data/Cargo.toml +27 -0
- data/LICENSE.txt +21 -0
- data/NOTICE +16 -0
- data/README.md +1166 -0
- data/ext/inkmark/Cargo.toml +31 -0
- data/ext/inkmark/build.rs +5 -0
- data/ext/inkmark/extconf.rb +6 -0
- data/ext/inkmark/src/autolink.rs +167 -0
- data/ext/inkmark/src/chunks_by_heading.rs +325 -0
- data/ext/inkmark/src/chunks_by_size.rs +302 -0
- data/ext/inkmark/src/document.rs +411 -0
- data/ext/inkmark/src/emoji.rs +197 -0
- data/ext/inkmark/src/handler.rs +758 -0
- data/ext/inkmark/src/heading.rs +262 -0
- data/ext/inkmark/src/highlight.rs +202 -0
- data/ext/inkmark/src/image.rs +284 -0
- data/ext/inkmark/src/lib.rs +54 -0
- data/ext/inkmark/src/link.rs +291 -0
- data/ext/inkmark/src/options.rs +231 -0
- data/ext/inkmark/src/plain_text.rs +445 -0
- data/ext/inkmark/src/scheme_filter.rs +319 -0
- data/ext/inkmark/src/stats.rs +453 -0
- data/ext/inkmark/src/tag_filter.rs +226 -0
- data/ext/inkmark/src/toc.rs +221 -0
- data/ext/inkmark/src/truncate.rs +267 -0
- data/ext/inkmark/src/url_match.rs +178 -0
- data/lib/inkmark/event.rb +342 -0
- data/lib/inkmark/native.rb +8 -0
- data/lib/inkmark/options.rb +698 -0
- data/lib/inkmark/toc.rb +40 -0
- data/lib/inkmark/version.rb +6 -0
- data/lib/inkmark.rb +711 -0
- data/sig/inkmark.rbs +219 -0
- metadata +208 -0
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
use globset::{Glob, GlobSet, GlobSetBuilder};
|
|
2
|
+
use magnus::value::{Id, LazyId};
|
|
3
|
+
use magnus::{Error, RHash, Ruby};
|
|
4
|
+
use pulldown_cmark::Options;
|
|
5
|
+
|
|
6
|
+
use crate::stats::ExtractFlags;
|
|
7
|
+
|
|
8
|
+
// `sym_id!(ruby, "name")` turns an option-key name into its Ruby `Id`
// by way of a `static LazyId` declared inside the macro's own block.
// Every expansion site gets a separate static, so each key is interned
// once for the lifetime of the process and later calls simply read the
// cached `Id`. This sidesteps the `ruby.to_symbol(key)` intern-table
// lookup that would otherwise be paid on every render for 25+ keys.
macro_rules! sym_id {
    ($ruby:expr, $name:literal) => {{
        static CACHED_ID: LazyId = LazyId::new($name);
        LazyId::get_inner_with(&CACHED_ID, $ruby)
    }};
}
|
|
21
|
+
|
|
22
|
+
/// Runtime flags that don't map to pulldown-cmark's `Options` bitflags but
|
|
23
|
+
/// instead drive Inkmark's own event filters (raw-HTML suppression, heading-id
|
|
24
|
+
/// generation, and future filters). Grouped into a struct so `build_options`
|
|
25
|
+
/// stays single-return as we add more filter knobs.
|
|
26
|
+
pub struct Flags {
|
|
27
|
+
pub suppress_raw_html: bool,
|
|
28
|
+
pub hard_wrap: bool,
|
|
29
|
+
pub gfm: bool,
|
|
30
|
+
pub gfm_tag_filter: bool,
|
|
31
|
+
pub heading_ids: bool,
|
|
32
|
+
pub emoji_shortcodes: bool,
|
|
33
|
+
pub autolink: bool,
|
|
34
|
+
pub lazy_images: bool,
|
|
35
|
+
pub nofollow_external_links: bool,
|
|
36
|
+
pub syntax_highlight: bool,
|
|
37
|
+
pub toc: bool,
|
|
38
|
+
pub toc_depth: Option<u8>,
|
|
39
|
+
pub statistics: bool,
|
|
40
|
+
// Extract-array flags, parsed from the nested `extract: {...}` hash.
|
|
41
|
+
// `ExtractFlags::any()` tells the renderer whether to take the
|
|
42
|
+
// single-pass stats/extract path.
|
|
43
|
+
pub extract: ExtractFlags,
|
|
44
|
+
// Compiled host-glob allowlists. `None` means the option was unset
|
|
45
|
+
// (no filtering); `Some(set)` means filter: `set` may be empty, in
|
|
46
|
+
// which case nothing matches and every external link/image is
|
|
47
|
+
// rejected.
|
|
48
|
+
pub allowed_link_hosts: Option<GlobSet>,
|
|
49
|
+
pub allowed_image_hosts: Option<GlobSet>,
|
|
50
|
+
// URL scheme allowlists for markdown-emitted links/images. `None`
|
|
51
|
+
// means the option is unset (filtering disabled—the Ruby-side
|
|
52
|
+
// default); `Some(list)` means filter. Stored as Vec rather than
|
|
53
|
+
// HashSet because realistic scheme lists are 2–5 entries, where a
|
|
54
|
+
// linear scan beats a hash table on cache alone.
|
|
55
|
+
pub allowed_link_schemes: Option<Vec<String>>,
|
|
56
|
+
pub allowed_image_schemes: Option<Vec<String>>,
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
pub fn build_options(ruby: &Ruby, hash: RHash) -> Result<(Options, Flags), Error> {
|
|
60
|
+
let mut opts = Options::empty();
|
|
61
|
+
|
|
62
|
+
let get_bool = |id: Id| -> Result<bool, Error> {
|
|
63
|
+
let value: Option<bool> = hash.lookup(id)?;
|
|
64
|
+
Ok(value.unwrap_or(false))
|
|
65
|
+
};
|
|
66
|
+
|
|
67
|
+
// Pull each bool option once; "gfm" used to feed both `opts` and
|
|
68
|
+
// `flags` via a redundant second lookup—now read once, reused.
|
|
69
|
+
let gfm = get_bool(sym_id!(ruby, "gfm"))?;
|
|
70
|
+
let tables = get_bool(sym_id!(ruby, "tables"))?;
|
|
71
|
+
let strikethrough = get_bool(sym_id!(ruby, "strikethrough"))?;
|
|
72
|
+
let tasklists = get_bool(sym_id!(ruby, "tasklists"))?;
|
|
73
|
+
let footnotes = get_bool(sym_id!(ruby, "footnotes"))?;
|
|
74
|
+
let smart_punctuation = get_bool(sym_id!(ruby, "smart_punctuation"))?;
|
|
75
|
+
let heading_attributes = get_bool(sym_id!(ruby, "heading_attributes"))?;
|
|
76
|
+
let math = get_bool(sym_id!(ruby, "math"))?;
|
|
77
|
+
let definition_list = get_bool(sym_id!(ruby, "definition_list"))?;
|
|
78
|
+
let superscript = get_bool(sym_id!(ruby, "superscript"))?;
|
|
79
|
+
let subscript = get_bool(sym_id!(ruby, "subscript"))?;
|
|
80
|
+
let wikilinks = get_bool(sym_id!(ruby, "wikilinks"))?;
|
|
81
|
+
let frontmatter = get_bool(sym_id!(ruby, "frontmatter"))?;
|
|
82
|
+
|
|
83
|
+
if gfm {
|
|
84
|
+
opts.insert(Options::ENABLE_GFM);
|
|
85
|
+
}
|
|
86
|
+
if tables {
|
|
87
|
+
opts.insert(Options::ENABLE_TABLES);
|
|
88
|
+
}
|
|
89
|
+
if strikethrough {
|
|
90
|
+
opts.insert(Options::ENABLE_STRIKETHROUGH);
|
|
91
|
+
}
|
|
92
|
+
if tasklists {
|
|
93
|
+
opts.insert(Options::ENABLE_TASKLISTS);
|
|
94
|
+
}
|
|
95
|
+
if footnotes {
|
|
96
|
+
opts.insert(Options::ENABLE_FOOTNOTES);
|
|
97
|
+
}
|
|
98
|
+
if smart_punctuation {
|
|
99
|
+
opts.insert(Options::ENABLE_SMART_PUNCTUATION);
|
|
100
|
+
}
|
|
101
|
+
if heading_attributes {
|
|
102
|
+
opts.insert(Options::ENABLE_HEADING_ATTRIBUTES);
|
|
103
|
+
}
|
|
104
|
+
if math {
|
|
105
|
+
opts.insert(Options::ENABLE_MATH);
|
|
106
|
+
}
|
|
107
|
+
if definition_list {
|
|
108
|
+
opts.insert(Options::ENABLE_DEFINITION_LIST);
|
|
109
|
+
}
|
|
110
|
+
if superscript {
|
|
111
|
+
opts.insert(Options::ENABLE_SUPERSCRIPT);
|
|
112
|
+
}
|
|
113
|
+
if subscript {
|
|
114
|
+
opts.insert(Options::ENABLE_SUBSCRIPT);
|
|
115
|
+
}
|
|
116
|
+
if wikilinks {
|
|
117
|
+
opts.insert(Options::ENABLE_WIKILINKS);
|
|
118
|
+
}
|
|
119
|
+
if frontmatter {
|
|
120
|
+
opts.insert(Options::ENABLE_YAML_STYLE_METADATA_BLOCKS);
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
let flags = Flags {
|
|
124
|
+
suppress_raw_html: !get_bool(sym_id!(ruby, "raw_html"))?,
|
|
125
|
+
hard_wrap: get_bool(sym_id!(ruby, "hard_wrap"))?,
|
|
126
|
+
gfm,
|
|
127
|
+
gfm_tag_filter: get_bool(sym_id!(ruby, "gfm_tag_filter"))?,
|
|
128
|
+
heading_ids: get_bool(sym_id!(ruby, "heading_ids"))?,
|
|
129
|
+
emoji_shortcodes: get_bool(sym_id!(ruby, "emoji_shortcodes"))?,
|
|
130
|
+
autolink: get_bool(sym_id!(ruby, "autolink"))?,
|
|
131
|
+
lazy_images: get_bool(sym_id!(ruby, "lazy_images"))?,
|
|
132
|
+
nofollow_external_links: get_bool(sym_id!(ruby, "nofollow_external_links"))?,
|
|
133
|
+
syntax_highlight: get_bool(sym_id!(ruby, "syntax_highlight"))?,
|
|
134
|
+
toc: get_bool(sym_id!(ruby, "toc"))?,
|
|
135
|
+
toc_depth: hash.lookup::<_, Option<u8>>(sym_id!(ruby, "toc_depth"))?,
|
|
136
|
+
statistics: get_bool(sym_id!(ruby, "statistics"))?,
|
|
137
|
+
extract: build_extract_flags(ruby, &hash)?,
|
|
138
|
+
allowed_link_hosts: build_host_globset(
|
|
139
|
+
ruby,
|
|
140
|
+
&hash,
|
|
141
|
+
sym_id!(ruby, "allowed_link_hosts"),
|
|
142
|
+
"allowed_link_hosts",
|
|
143
|
+
)?,
|
|
144
|
+
allowed_image_hosts: build_host_globset(
|
|
145
|
+
ruby,
|
|
146
|
+
&hash,
|
|
147
|
+
sym_id!(ruby, "allowed_image_hosts"),
|
|
148
|
+
"allowed_image_hosts",
|
|
149
|
+
)?,
|
|
150
|
+
allowed_link_schemes: build_scheme_set(&hash, sym_id!(ruby, "allowed_link_schemes"))?,
|
|
151
|
+
allowed_image_schemes: build_scheme_set(&hash, sym_id!(ruby, "allowed_image_schemes"))?,
|
|
152
|
+
};
|
|
153
|
+
Ok((opts, flags))
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
/// Read an optional `Array<String>` option and compile it into a `GlobSet`.
|
|
157
|
+
/// Returns `Ok(None)` when the option is `nil` (the Ruby-side default) —
|
|
158
|
+
/// this signals "filtering disabled" to the event pipeline.
|
|
159
|
+
///
|
|
160
|
+
/// An empty array compiles to an empty `GlobSet` that matches nothing, so
|
|
161
|
+
/// `allowed_link_hosts: []` acts as a deny-all allowlist. Pattern compile
|
|
162
|
+
/// failures surface as a Ruby `ArgumentError` with the bad pattern quoted
|
|
163
|
+
/// so the user can find and fix it.
|
|
164
|
+
fn build_host_globset(
|
|
165
|
+
ruby: &Ruby,
|
|
166
|
+
hash: &RHash,
|
|
167
|
+
key_id: Id,
|
|
168
|
+
key_name: &str,
|
|
169
|
+
) -> Result<Option<GlobSet>, Error> {
|
|
170
|
+
let patterns: Option<Vec<String>> = hash.lookup(key_id)?;
|
|
171
|
+
let Some(patterns) = patterns else {
|
|
172
|
+
return Ok(None);
|
|
173
|
+
};
|
|
174
|
+
|
|
175
|
+
let mut builder = GlobSetBuilder::new();
|
|
176
|
+
for pattern in &patterns {
|
|
177
|
+
let glob = Glob::new(pattern).map_err(|e| {
|
|
178
|
+
Error::new(
|
|
179
|
+
ruby.exception_arg_error(),
|
|
180
|
+
format!("invalid glob pattern in {key_name}: {pattern:?}—{e}"),
|
|
181
|
+
)
|
|
182
|
+
})?;
|
|
183
|
+
builder.add(glob);
|
|
184
|
+
}
|
|
185
|
+
let set = builder.build().map_err(|e| {
|
|
186
|
+
Error::new(
|
|
187
|
+
ruby.exception_arg_error(),
|
|
188
|
+
format!("failed to compile {key_name} globset: {e}"),
|
|
189
|
+
)
|
|
190
|
+
})?;
|
|
191
|
+
Ok(Some(set))
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
/// Read an optional `Array<String>` scheme allowlist and normalize to
|
|
195
|
+
/// lowercase. Returns `Ok(None)` when the option is `nil`, signalling
|
|
196
|
+
/// "filtering disabled" to the pipeline. An empty array compiles to an
|
|
197
|
+
/// empty `Vec` that matches nothing, which blocks every absolute URL
|
|
198
|
+
/// (relative URLs still pass through
|
|
199
|
+
/// [`crate::url_match::is_scheme_allowed`]).
|
|
200
|
+
fn build_scheme_set(hash: &RHash, key_id: Id) -> Result<Option<Vec<String>>, Error> {
|
|
201
|
+
let schemes: Option<Vec<String>> = hash.lookup(key_id)?;
|
|
202
|
+
Ok(schemes.map(|list| list.into_iter().map(|s| s.to_ascii_lowercase()).collect()))
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
/// Read the nested `extract: { images: true, ... }` hash and compile to
|
|
206
|
+
/// an `ExtractFlags`. Nil / missing option → all flags off.
|
|
207
|
+
///
|
|
208
|
+
/// Ruby-side validation (`Inkmark::Options`) enforces the key set and
|
|
209
|
+
/// boolean value type, so by the time we get here an unknown key or
|
|
210
|
+
/// non-boolean value has already raised `ArgumentError`. We still read
|
|
211
|
+
/// defensively using `Option<bool>` + `unwrap_or(false)` so that a
|
|
212
|
+
/// missing sub-key is treated as "off".
|
|
213
|
+
fn build_extract_flags(ruby: &Ruby, hash: &RHash) -> Result<ExtractFlags, Error> {
|
|
214
|
+
let nested: Option<RHash> = hash.lookup(sym_id!(ruby, "extract"))?;
|
|
215
|
+
let Some(nested) = nested else {
|
|
216
|
+
return Ok(ExtractFlags::default());
|
|
217
|
+
};
|
|
218
|
+
|
|
219
|
+
let read = |id: Id| -> Result<bool, Error> {
|
|
220
|
+
let v: Option<bool> = nested.lookup(id)?;
|
|
221
|
+
Ok(v.unwrap_or(false))
|
|
222
|
+
};
|
|
223
|
+
|
|
224
|
+
Ok(ExtractFlags {
|
|
225
|
+
images: read(sym_id!(ruby, "images"))?,
|
|
226
|
+
links: read(sym_id!(ruby, "links"))?,
|
|
227
|
+
code_blocks: read(sym_id!(ruby, "code_blocks"))?,
|
|
228
|
+
headings: read(sym_id!(ruby, "headings"))?,
|
|
229
|
+
footnote_definitions: read(sym_id!(ruby, "footnote_definitions"))?,
|
|
230
|
+
})
|
|
231
|
+
}
|
|
@@ -0,0 +1,445 @@
|
|
|
1
|
+
//! Serialize pulldown-cmark events to plain text.
|
|
2
|
+
//!
|
|
3
|
+
//! Designed for embedding models, token counting, and any pipeline
|
|
4
|
+
//! where Markdown syntax is noise. Runs after the normal filter
|
|
5
|
+
//! pipeline (emoji replacement, autolink, host/scheme allowlists), so
|
|
6
|
+
//! the caller already sees resolved emoji, unwrapped disallowed links,
|
|
7
|
+
//! and so on.
|
|
8
|
+
//!
|
|
9
|
+
//! Core idea: **buffer stack**. Most writes go to the top-of-stack
|
|
10
|
+
//! buffer. Contexts that need post-processing (blockquote line
|
|
11
|
+
//! prefixing, link `text (url)` formatting, image alt capture,
|
|
12
|
+
//! footnote body capture) open a fresh buffer at the Start event and
|
|
13
|
+
//! pop + format at End. Nested contexts fall out for free because the
|
|
14
|
+
//! stack naturally tracks nesting depth.
|
|
15
|
+
|
|
16
|
+
use pulldown_cmark::{Event, Tag, TagEnd};
|
|
17
|
+
|
|
18
|
+
/// Write plain-text output into `buf` from a pulldown-cmark event stream.
|
|
19
|
+
pub fn write_plain_text<'a, I: IntoIterator<Item = Event<'a>>>(events: I, buf: &mut String) {
|
|
20
|
+
let mut w = Writer::new();
|
|
21
|
+
for event in events {
|
|
22
|
+
w.handle(event);
|
|
23
|
+
}
|
|
24
|
+
let out = w.finalize();
|
|
25
|
+
buf.push_str(&out);
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
struct Writer {
|
|
29
|
+
/// Stack of write targets. Always non-empty; top is the current
|
|
30
|
+
/// target. `open()` pushes, `close()` pops.
|
|
31
|
+
buffers: Vec<String>,
|
|
32
|
+
list_stack: Vec<ListCtx>,
|
|
33
|
+
link_dest: String,
|
|
34
|
+
image_dest: String,
|
|
35
|
+
footnote_label: String,
|
|
36
|
+
/// Accumulated definitions, emitted at `finalize` in document order.
|
|
37
|
+
footnote_bodies: Vec<(String, String)>,
|
|
38
|
+
/// Current row's cells, tab-joined at TableRow/TableHead End.
|
|
39
|
+
current_row: Vec<String>,
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
/// Per-list rendering state kept on `Writer::list_stack`.
struct ListCtx {
    /// `true` for a numbered list, `false` for a bulleted one.
    ordered: bool,
    /// Number the next item of an ordered list will be given.
    counter: u64,
    /// Count of leading spaces written before each item's bullet.
    indent: usize,
}
|
|
47
|
+
|
|
48
|
+
impl Writer {
|
|
49
|
+
fn new() -> Self {
|
|
50
|
+
Self {
|
|
51
|
+
buffers: vec![String::new()],
|
|
52
|
+
list_stack: Vec::new(),
|
|
53
|
+
link_dest: String::new(),
|
|
54
|
+
image_dest: String::new(),
|
|
55
|
+
footnote_label: String::new(),
|
|
56
|
+
footnote_bodies: Vec::new(),
|
|
57
|
+
current_row: Vec::new(),
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
fn write(&mut self, s: &str) {
|
|
62
|
+
self.buffers
|
|
63
|
+
.last_mut()
|
|
64
|
+
.expect("buffer stack is never empty")
|
|
65
|
+
.push_str(s);
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
fn open(&mut self) {
|
|
69
|
+
self.buffers.push(String::new());
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
fn close(&mut self) -> String {
|
|
73
|
+
self.buffers.pop().expect("close() without matching open()")
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
/// Ensure the current buffer ends with exactly one blank line
|
|
77
|
+
/// (i.e. `"\n\n"`), except when the buffer is empty (no leading
|
|
78
|
+
/// newlines at document or subtree start).
|
|
79
|
+
fn ensure_blank_line(&mut self) {
|
|
80
|
+
let buf = self.buffers.last().expect("buffer stack is never empty");
|
|
81
|
+
if buf.is_empty() || buf.ends_with("\n\n") {
|
|
82
|
+
return;
|
|
83
|
+
}
|
|
84
|
+
if buf.ends_with('\n') {
|
|
85
|
+
self.write("\n");
|
|
86
|
+
} else {
|
|
87
|
+
self.write("\n\n");
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
/// Ensure the current buffer ends with `\n`. Used for transitions
|
|
92
|
+
/// that should just break the current line without introducing
|
|
93
|
+
/// paragraph-style separation (e.g. a nested list inside a list
|
|
94
|
+
/// item: `- outer\n - inner`, not a blank line between them).
|
|
95
|
+
fn ensure_newline(&mut self) {
|
|
96
|
+
let buf = self.buffers.last().expect("buffer stack is never empty");
|
|
97
|
+
if buf.is_empty() || buf.ends_with('\n') {
|
|
98
|
+
return;
|
|
99
|
+
}
|
|
100
|
+
self.write("\n");
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
fn handle(&mut self, event: Event<'_>) {
|
|
104
|
+
match event {
|
|
105
|
+
Event::Start(tag) => self.start(tag),
|
|
106
|
+
Event::End(end) => self.end(end),
|
|
107
|
+
Event::Text(t) | Event::Code(t) => self.write(&t),
|
|
108
|
+
Event::SoftBreak => self.write(" "),
|
|
109
|
+
Event::HardBreak => self.write("\n"),
|
|
110
|
+
Event::Rule => {
|
|
111
|
+
self.ensure_blank_line();
|
|
112
|
+
self.write("---\n\n");
|
|
113
|
+
}
|
|
114
|
+
// Raw HTML reaches us only when raw_html: true (the
|
|
115
|
+
// suppress_raw_html filter rewrites it to Event::Text
|
|
116
|
+
// otherwise). Emit it verbatim to mirror the to_html /
|
|
117
|
+
// to_markdown contract.
|
|
118
|
+
Event::Html(h) | Event::InlineHtml(h) => self.write(&h),
|
|
119
|
+
Event::FootnoteReference(label) => {
|
|
120
|
+
self.write("[");
|
|
121
|
+
self.write(&label);
|
|
122
|
+
self.write("]");
|
|
123
|
+
}
|
|
124
|
+
// Task-list markers are dropped; the item bullet remains.
|
|
125
|
+
Event::TaskListMarker(_) => {}
|
|
126
|
+
Event::InlineMath(t) | Event::DisplayMath(t) => self.write(&t),
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
fn start(&mut self, tag: Tag<'_>) {
|
|
131
|
+
match tag {
|
|
132
|
+
Tag::Paragraph => {}
|
|
133
|
+
Tag::Heading { .. } => self.ensure_blank_line(),
|
|
134
|
+
Tag::BlockQuote(_) => {
|
|
135
|
+
self.ensure_blank_line();
|
|
136
|
+
self.open();
|
|
137
|
+
}
|
|
138
|
+
Tag::CodeBlock(_) => {
|
|
139
|
+
self.ensure_blank_line();
|
|
140
|
+
self.open();
|
|
141
|
+
}
|
|
142
|
+
Tag::List(first) => {
|
|
143
|
+
// Nested lists separate with a single newline (appear
|
|
144
|
+
// as the next line of their parent item); top-level
|
|
145
|
+
// lists get paragraph-style blank-line separation.
|
|
146
|
+
if self.list_stack.is_empty() {
|
|
147
|
+
self.ensure_blank_line();
|
|
148
|
+
} else {
|
|
149
|
+
self.ensure_newline();
|
|
150
|
+
}
|
|
151
|
+
let indent = self.list_stack.len() * 2;
|
|
152
|
+
self.list_stack.push(ListCtx {
|
|
153
|
+
ordered: first.is_some(),
|
|
154
|
+
counter: first.unwrap_or(1),
|
|
155
|
+
indent,
|
|
156
|
+
});
|
|
157
|
+
}
|
|
158
|
+
Tag::Item => {
|
|
159
|
+
let ctx = self.list_stack.last_mut().expect("item outside list");
|
|
160
|
+
let indent = " ".repeat(ctx.indent);
|
|
161
|
+
let bullet = if ctx.ordered {
|
|
162
|
+
let n = ctx.counter;
|
|
163
|
+
ctx.counter += 1;
|
|
164
|
+
format!("{}. ", n)
|
|
165
|
+
} else {
|
|
166
|
+
"- ".to_string()
|
|
167
|
+
};
|
|
168
|
+
self.write(&indent);
|
|
169
|
+
self.write(&bullet);
|
|
170
|
+
}
|
|
171
|
+
Tag::Table(_) => self.ensure_blank_line(),
|
|
172
|
+
Tag::TableHead | Tag::TableRow => {}
|
|
173
|
+
Tag::TableCell => self.open(),
|
|
174
|
+
Tag::Link { dest_url, .. } => {
|
|
175
|
+
self.link_dest = dest_url.to_string();
|
|
176
|
+
self.open();
|
|
177
|
+
}
|
|
178
|
+
Tag::Image { dest_url, .. } => {
|
|
179
|
+
self.image_dest = dest_url.to_string();
|
|
180
|
+
self.open();
|
|
181
|
+
}
|
|
182
|
+
Tag::Emphasis | Tag::Strong | Tag::Strikethrough => {}
|
|
183
|
+
Tag::FootnoteDefinition(label) => {
|
|
184
|
+
self.footnote_label = label.to_string();
|
|
185
|
+
self.open();
|
|
186
|
+
}
|
|
187
|
+
// YAML metadata: buffer + discard on End so the raw
|
|
188
|
+
// frontmatter never reaches plain-text output (the Ruby
|
|
189
|
+
// side consumes it separately via `frontmatter`).
|
|
190
|
+
Tag::MetadataBlock(_) => self.open(),
|
|
191
|
+
// Pass-through structural tags—inner content writes to
|
|
192
|
+
// the current buffer unchanged.
|
|
193
|
+
Tag::HtmlBlock
|
|
194
|
+
| Tag::DefinitionList
|
|
195
|
+
| Tag::DefinitionListTitle
|
|
196
|
+
| Tag::DefinitionListDefinition
|
|
197
|
+
| Tag::Subscript
|
|
198
|
+
| Tag::Superscript => {}
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
fn end(&mut self, end: TagEnd) {
|
|
203
|
+
match end {
|
|
204
|
+
TagEnd::Paragraph | TagEnd::Heading(_) => self.write("\n\n"),
|
|
205
|
+
TagEnd::BlockQuote(_) => {
|
|
206
|
+
let inner = self.close();
|
|
207
|
+
let prefixed = prefix_lines(inner.trim_end_matches('\n'), "> ");
|
|
208
|
+
self.write(&prefixed);
|
|
209
|
+
self.write("\n\n");
|
|
210
|
+
}
|
|
211
|
+
TagEnd::CodeBlock => {
|
|
212
|
+
let inner = self.close();
|
|
213
|
+
self.write(&inner);
|
|
214
|
+
self.ensure_blank_line();
|
|
215
|
+
}
|
|
216
|
+
TagEnd::List(_) => {
|
|
217
|
+
self.list_stack.pop();
|
|
218
|
+
// Only paragraph-separate after top-level lists; inside
|
|
219
|
+
// a parent item we're about to hit End(Item), which
|
|
220
|
+
// writes its own `\n`.
|
|
221
|
+
if self.list_stack.is_empty() {
|
|
222
|
+
self.ensure_blank_line();
|
|
223
|
+
} else {
|
|
224
|
+
self.ensure_newline();
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
TagEnd::Item => self.write("\n"),
|
|
228
|
+
TagEnd::Table => self.write("\n"),
|
|
229
|
+
TagEnd::TableHead => {
|
|
230
|
+
let row = std::mem::take(&mut self.current_row).join("\t");
|
|
231
|
+
self.write(&row);
|
|
232
|
+
// Blank line between header and body for readability.
|
|
233
|
+
self.write("\n\n");
|
|
234
|
+
}
|
|
235
|
+
TagEnd::TableRow => {
|
|
236
|
+
let row = std::mem::take(&mut self.current_row).join("\t");
|
|
237
|
+
self.write(&row);
|
|
238
|
+
self.write("\n");
|
|
239
|
+
}
|
|
240
|
+
TagEnd::TableCell => {
|
|
241
|
+
let cell = self.close();
|
|
242
|
+
self.current_row.push(cell);
|
|
243
|
+
}
|
|
244
|
+
TagEnd::Link => {
|
|
245
|
+
let text = self.close();
|
|
246
|
+
// Collapse when link text equals its URL (autolinks
|
|
247
|
+
// like `<https://x>` or linkify-produced links).
|
|
248
|
+
if text == self.link_dest {
|
|
249
|
+
self.write(&text);
|
|
250
|
+
} else {
|
|
251
|
+
self.write(&text);
|
|
252
|
+
self.write(" (");
|
|
253
|
+
let dest = std::mem::take(&mut self.link_dest);
|
|
254
|
+
self.write(&dest);
|
|
255
|
+
self.write(")");
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
TagEnd::Image => {
|
|
259
|
+
let alt = self.close();
|
|
260
|
+
self.write(&alt);
|
|
261
|
+
self.write(" (");
|
|
262
|
+
let dest = std::mem::take(&mut self.image_dest);
|
|
263
|
+
self.write(&dest);
|
|
264
|
+
self.write(")");
|
|
265
|
+
}
|
|
266
|
+
TagEnd::FootnoteDefinition => {
|
|
267
|
+
let body = self.close();
|
|
268
|
+
let label = std::mem::take(&mut self.footnote_label);
|
|
269
|
+
self.footnote_bodies.push((label, body.trim().to_string()));
|
|
270
|
+
}
|
|
271
|
+
TagEnd::MetadataBlock(_) => {
|
|
272
|
+
let _ = self.close();
|
|
273
|
+
}
|
|
274
|
+
_ => {}
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
fn finalize(mut self) -> String {
|
|
279
|
+
if !self.footnote_bodies.is_empty() {
|
|
280
|
+
self.ensure_blank_line();
|
|
281
|
+
let defs = std::mem::take(&mut self.footnote_bodies);
|
|
282
|
+
for (i, (label, body)) in defs.iter().enumerate() {
|
|
283
|
+
if i > 0 {
|
|
284
|
+
self.write("\n");
|
|
285
|
+
}
|
|
286
|
+
self.write("[");
|
|
287
|
+
self.write(label);
|
|
288
|
+
self.write("]: ");
|
|
289
|
+
self.write(body);
|
|
290
|
+
}
|
|
291
|
+
self.write("\n");
|
|
292
|
+
}
|
|
293
|
+
let mut out = self.buffers.pop().expect("buffer stack is never empty");
|
|
294
|
+
// Trim trailing blank lines down to one final newline.
|
|
295
|
+
while out.ends_with("\n\n") {
|
|
296
|
+
out.pop();
|
|
297
|
+
}
|
|
298
|
+
if !out.is_empty() && !out.ends_with('\n') {
|
|
299
|
+
out.push('\n');
|
|
300
|
+
}
|
|
301
|
+
out
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
/// Put `prefix` in front of each `\n`-separated line of `s`.
///
/// A line that is empty gets the prefix with its trailing whitespace
/// removed instead, so a `"> "` prefix on a blank line yields just `>`,
/// matching email quoting convention. Line breaks are preserved
/// one-for-one; no trailing newline is added.
fn prefix_lines(s: &str, prefix: &str) -> String {
    let bare_prefix = prefix.trim_end();
    let mut quoted = String::with_capacity(s.len() + prefix.len() * 4);
    let mut emit = |line: &str, target: &mut String| {
        if line.is_empty() {
            target.push_str(bare_prefix);
        } else {
            target.push_str(prefix);
            target.push_str(line);
        }
    };

    let mut lines = s.split('\n');
    // `split` always yields at least one piece, even for an empty input.
    if let Some(first) = lines.next() {
        emit(first, &mut quoted);
    }
    for line in lines {
        quoted.push('\n');
        emit(line, &mut quoted);
    }
    quoted
}
|
|
324
|
+
|
|
325
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use pulldown_cmark::{Options, Parser};

    /// Render `md` to plain text with the GFM-style extensions the tests
    /// rely on (tables, strikethrough, task lists, footnotes) enabled.
    fn plain(md: &str) -> String {
        let mut buf = String::new();
        let opts = Options::ENABLE_GFM
            | Options::ENABLE_TABLES
            | Options::ENABLE_STRIKETHROUGH
            | Options::ENABLE_TASKLISTS
            | Options::ENABLE_FOOTNOTES;
        write_plain_text(Parser::new_ext(md, opts), &mut buf);
        buf
    }

    #[test]
    fn paragraph_strips_emphasis() {
        assert_eq!(
            plain("**bold** and *italic* and ~~strike~~"),
            "bold and italic and strike\n"
        );
    }

    #[test]
    fn link_expands_to_text_with_url() {
        assert_eq!(
            plain("[example](https://example.net)"),
            "example (https://example.net)\n"
        );
    }

    #[test]
    fn autolink_collapses_text_equals_url() {
        assert_eq!(plain("<https://example.net>"), "https://example.net\n");
    }

    #[test]
    fn image_emits_alt_and_src() {
        // The input must be actual Markdown image syntax; an empty
        // string renders to an empty string, not `cat (cat.png)`.
        assert_eq!(plain("![cat](cat.png)"), "cat (cat.png)\n");
    }

    #[test]
    fn heading_is_plain_text_with_blank_line() {
        assert_eq!(plain("# Title\n\nBody"), "Title\n\nBody\n");
    }

    #[test]
    fn blockquote_prefixes_lines() {
        let out = plain("> hello\n> world");
        assert_eq!(out, "> hello world\n");
    }

    #[test]
    fn nested_blockquote_double_prefix() {
        let out = plain("> > nested");
        assert_eq!(out, "> > nested\n");
    }

    #[test]
    fn blockquote_with_blank_line_uses_bare_marker() {
        let out = plain("> first\n>\n> second");
        assert_eq!(out, "> first\n>\n> second\n");
    }

    #[test]
    fn unordered_list_dash_bullet() {
        assert_eq!(plain("- a\n- b"), "- a\n- b\n");
    }

    #[test]
    fn ordered_list_numbers() {
        assert_eq!(plain("1. first\n2. second"), "1. first\n2. second\n");
    }

    #[test]
    fn nested_list_indented_two_spaces() {
        // Two spaces of indent are required for `inner` to nest under
        // `outer`; the writer indents nested bullets by two spaces too.
        let out = plain("- outer\n  - inner");
        assert_eq!(out, "- outer\n  - inner\n");
    }

    #[test]
    fn tasklist_drops_checkbox() {
        assert_eq!(plain("- [x] done\n- [ ] todo"), "- done\n- todo\n");
    }

    #[test]
    fn table_header_blank_line_then_body() {
        let md = "| a | b |\n|---|---|\n| 1 | 2 |\n| 3 | 4 |";
        let out = plain(md);
        assert_eq!(out, "a\tb\n\n1\t2\n3\t4\n");
    }

    #[test]
    fn code_block_preserved_verbatim() {
        let out = plain("```ruby\nputs \"hi\"\n```");
        assert_eq!(out, "puts \"hi\"\n");
    }

    #[test]
    fn horizontal_rule_emits_dashes() {
        assert_eq!(plain("before\n\n---\n\nafter"), "before\n\n---\n\nafter\n");
    }

    #[test]
    fn footnote_reference_and_definition() {
        let md = "See[^x].\n\n[^x]: body text";
        let out = plain(md);
        assert_eq!(out, "See[x].\n\n[x]: body text\n");
    }

    #[test]
    fn inline_code_strips_backticks() {
        assert_eq!(plain("use `puts` please"), "use puts please\n");
    }

    #[test]
    fn hard_break_is_newline() {
        // A Markdown hard break needs two trailing spaces before the
        // newline; a single space is only a soft break.
        assert_eq!(plain("line1  \nline2"), "line1\nline2\n");
    }
}
|