inkmark 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Cargo.lock +940 -0
- data/Cargo.toml +27 -0
- data/LICENSE.txt +21 -0
- data/NOTICE +16 -0
- data/README.md +1166 -0
- data/ext/inkmark/Cargo.toml +31 -0
- data/ext/inkmark/build.rs +5 -0
- data/ext/inkmark/extconf.rb +6 -0
- data/ext/inkmark/src/autolink.rs +167 -0
- data/ext/inkmark/src/chunks_by_heading.rs +325 -0
- data/ext/inkmark/src/chunks_by_size.rs +302 -0
- data/ext/inkmark/src/document.rs +411 -0
- data/ext/inkmark/src/emoji.rs +197 -0
- data/ext/inkmark/src/handler.rs +758 -0
- data/ext/inkmark/src/heading.rs +262 -0
- data/ext/inkmark/src/highlight.rs +202 -0
- data/ext/inkmark/src/image.rs +284 -0
- data/ext/inkmark/src/lib.rs +54 -0
- data/ext/inkmark/src/link.rs +291 -0
- data/ext/inkmark/src/options.rs +231 -0
- data/ext/inkmark/src/plain_text.rs +445 -0
- data/ext/inkmark/src/scheme_filter.rs +319 -0
- data/ext/inkmark/src/stats.rs +453 -0
- data/ext/inkmark/src/tag_filter.rs +226 -0
- data/ext/inkmark/src/toc.rs +221 -0
- data/ext/inkmark/src/truncate.rs +267 -0
- data/ext/inkmark/src/url_match.rs +178 -0
- data/lib/inkmark/event.rb +342 -0
- data/lib/inkmark/native.rb +8 -0
- data/lib/inkmark/options.rb +698 -0
- data/lib/inkmark/toc.rb +40 -0
- data/lib/inkmark/version.rb +6 -0
- data/lib/inkmark.rb +711 -0
- data/sig/inkmark.rbs +219 -0
- metadata +208 -0
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
//! Table-of-contents rendering.
|
|
2
|
+
//!
|
|
3
|
+
//! Converts a slice of `TocEntry` values (produced by `stats::collect`)
|
|
4
|
+
//! into markdown or HTML strings.
|
|
5
|
+
|
|
6
|
+
use pulldown_cmark::HeadingLevel;
|
|
7
|
+
use pulldown_cmark_escape::escape_html;
|
|
8
|
+
|
|
9
|
+
pub struct TocEntry {
|
|
10
|
+
pub level: HeadingLevel,
|
|
11
|
+
pub text: String,
|
|
12
|
+
pub slug: String,
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
/// Render TOC entries as a markdown list.
|
|
16
|
+
///
|
|
17
|
+
/// Square brackets in heading text are escaped (`[` → `\[`, `]` → `\]`) so
|
|
18
|
+
/// they don't break the markdown link syntax `[text](#slug)`.
|
|
19
|
+
///
|
|
20
|
+
/// When `max_depth` is `Some(n)`, only headings at level ≤ n are included.
|
|
21
|
+
/// `None` means no depth filtering (every heading appears).
|
|
22
|
+
pub fn toc_to_markdown(entries: &[TocEntry], max_depth: Option<u8>) -> String {
|
|
23
|
+
let filtered: Vec<&TocEntry> = entries
|
|
24
|
+
.iter()
|
|
25
|
+
.filter(|e| max_depth.map_or(true, |max| level_to_u8(e.level) <= max))
|
|
26
|
+
.collect();
|
|
27
|
+
if filtered.is_empty() {
|
|
28
|
+
return String::new();
|
|
29
|
+
}
|
|
30
|
+
let min_level = filtered
|
|
31
|
+
.iter()
|
|
32
|
+
.map(|e| level_to_u8(e.level))
|
|
33
|
+
.min()
|
|
34
|
+
.unwrap_or(1);
|
|
35
|
+
|
|
36
|
+
let mut buf = String::new();
|
|
37
|
+
for entry in &filtered {
|
|
38
|
+
let indent = (level_to_u8(entry.level) - min_level) as usize * 2;
|
|
39
|
+
for _ in 0..indent {
|
|
40
|
+
buf.push(' ');
|
|
41
|
+
}
|
|
42
|
+
buf.push_str("- [");
|
|
43
|
+
// Escape [ and ] so they don't break the markdown link syntax.
|
|
44
|
+
for ch in entry.text.chars() {
|
|
45
|
+
match ch {
|
|
46
|
+
'[' => buf.push_str("\\["),
|
|
47
|
+
']' => buf.push_str("\\]"),
|
|
48
|
+
c => buf.push(c),
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
buf.push_str("](#");
|
|
52
|
+
buf.push_str(&entry.slug);
|
|
53
|
+
buf.push_str(")\n");
|
|
54
|
+
}
|
|
55
|
+
buf
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
/// Render TOC entries as a nested HTML `<ul>` list.
|
|
59
|
+
///
|
|
60
|
+
/// When `max_depth` is `Some(n)`, only headings at level ≤ n are included.
|
|
61
|
+
pub fn toc_to_html(entries: &[TocEntry], max_depth: Option<u8>) -> String {
|
|
62
|
+
let filtered: Vec<&TocEntry> = entries
|
|
63
|
+
.iter()
|
|
64
|
+
.filter(|e| max_depth.map_or(true, |max| level_to_u8(e.level) <= max))
|
|
65
|
+
.collect();
|
|
66
|
+
if filtered.is_empty() {
|
|
67
|
+
return String::new();
|
|
68
|
+
}
|
|
69
|
+
let min_level = filtered
|
|
70
|
+
.iter()
|
|
71
|
+
.map(|e| level_to_u8(e.level))
|
|
72
|
+
.min()
|
|
73
|
+
.unwrap_or(1);
|
|
74
|
+
let mut buf = String::new();
|
|
75
|
+
let mut open_levels: Vec<u8> = Vec::new();
|
|
76
|
+
|
|
77
|
+
for entry in &filtered {
|
|
78
|
+
let lvl = level_to_u8(entry.level);
|
|
79
|
+
|
|
80
|
+
while open_levels
|
|
81
|
+
.last()
|
|
82
|
+
.copied()
|
|
83
|
+
.unwrap_or(min_level.saturating_sub(1))
|
|
84
|
+
>= lvl
|
|
85
|
+
{
|
|
86
|
+
buf.push_str("</li>\n</ul>\n");
|
|
87
|
+
open_levels.pop();
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
while open_levels
|
|
91
|
+
.last()
|
|
92
|
+
.copied()
|
|
93
|
+
.unwrap_or(min_level.saturating_sub(1))
|
|
94
|
+
< lvl
|
|
95
|
+
{
|
|
96
|
+
buf.push_str("<ul>\n");
|
|
97
|
+
open_levels.push(lvl);
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
buf.push_str("<li><a href=\"#");
|
|
101
|
+
buf.push_str(&entry.slug);
|
|
102
|
+
buf.push_str("\">");
|
|
103
|
+
|
|
104
|
+
let _ = escape_html(&mut buf, &entry.text);
|
|
105
|
+
buf.push_str("</a>\n");
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
while open_levels.pop().is_some() {
|
|
109
|
+
buf.push_str("</li>\n</ul>\n");
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
buf
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
#[inline]
|
|
116
|
+
pub fn level_to_u8(level: HeadingLevel) -> u8 {
|
|
117
|
+
match level {
|
|
118
|
+
HeadingLevel::H1 => 1,
|
|
119
|
+
HeadingLevel::H2 => 2,
|
|
120
|
+
HeadingLevel::H3 => 3,
|
|
121
|
+
HeadingLevel::H4 => 4,
|
|
122
|
+
HeadingLevel::H5 => 5,
|
|
123
|
+
HeadingLevel::H6 => 6,
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use pulldown_cmark::HeadingLevel;

    /// Build a `TocEntry` from borrowed strings.
    fn entry(level: HeadingLevel, text: &str, slug: &str) -> TocEntry {
        TocEntry {
            level,
            text: text.to_string(),
            slug: slug.to_string(),
        }
    }

    #[test]
    fn empty_toc_returns_empty_string() {
        assert_eq!(toc_to_markdown(&[], None), "");
        assert_eq!(toc_to_html(&[], None), "");
    }

    #[test]
    fn markdown_escapes_brackets_in_heading_text() {
        let entries = vec![entry(HeadingLevel::H1, "Arrays [1..n]", "arrays-1-n")];
        let md = toc_to_markdown(&entries, None);
        assert!(md.contains("\\[1..n\\]"), "brackets must be escaped: {md}");
        assert!(md.contains("(#arrays-1-n)"));
    }

    #[test]
    fn markdown_simple_toc() {
        let entries = vec![
            entry(HeadingLevel::H1, "Introduction", "introduction"),
            entry(HeadingLevel::H2, "Background", "background"),
        ];
        let md = toc_to_markdown(&entries, None);
        // The h2 sits one level below the h1, so it gets a two-space indent.
        assert_eq!(
            md,
            "- [Introduction](#introduction)\n  - [Background](#background)\n"
        );
    }

    #[test]
    fn html_toc_escapes_text() {
        let entries = vec![entry(HeadingLevel::H1, "A & B <C>", "a-b-c")];
        let html = toc_to_html(&entries, None);
        // `&`, `<` and `>` must come out as entities, not raw characters.
        assert!(html.contains("A &amp; B &lt;C&gt;"));
    }

    #[test]
    fn max_depth_filters_markdown() {
        let entries = vec![
            entry(HeadingLevel::H1, "Top", "top"),
            entry(HeadingLevel::H2, "Mid", "mid"),
            entry(HeadingLevel::H3, "Deep", "deep"),
        ];
        let md = toc_to_markdown(&entries, Some(2));
        assert!(md.contains("[Top]"));
        assert!(md.contains("[Mid]"));
        assert!(!md.contains("[Deep]"));
    }

    #[test]
    fn max_depth_filters_html() {
        let entries = vec![
            entry(HeadingLevel::H1, "Top", "top"),
            entry(HeadingLevel::H2, "Mid", "mid"),
            entry(HeadingLevel::H3, "Deep", "deep"),
        ];
        let html = toc_to_html(&entries, Some(2));
        assert!(html.contains(">Top</a>"));
        assert!(html.contains(">Mid</a>"));
        assert!(!html.contains(">Deep</a>"));
    }

    #[test]
    fn max_depth_one_keeps_only_h1() {
        let entries = vec![
            entry(HeadingLevel::H1, "Top", "top"),
            entry(HeadingLevel::H2, "Mid", "mid"),
        ];
        let md = toc_to_markdown(&entries, Some(1));
        assert_eq!(md, "- [Top](#top)\n");
    }

    #[test]
    fn max_depth_respects_remaining_entries_for_min_level() {
        // With h3 filtered out, min_level is h1, so h2 gets 2-space indent.
        let entries = vec![
            entry(HeadingLevel::H1, "Top", "top"),
            entry(HeadingLevel::H2, "Mid", "mid"),
            entry(HeadingLevel::H3, "Deep", "deep"),
        ];
        let md = toc_to_markdown(&entries, Some(2));
        assert_eq!(md, "- [Top](#top)\n  - [Mid](#mid)\n");
    }
}
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
//! Markdown truncation for LLM / RAG pipelines.
|
|
2
|
+
//!
|
|
3
|
+
//! Cuts a filter-applied event stream at either a block boundary
|
|
4
|
+
//! (`TruncateAt::Block`) or a Unicode word boundary
|
|
5
|
+
//! (`TruncateAt::Word`), respecting optional character and word
|
|
6
|
+
//! budgets. Designed as a first-stage preprocessor for embedding
|
|
7
|
+
//! input, context-window budgeting, and chunk normalization.
|
|
8
|
+
|
|
9
|
+
use magnus::{Error, RHash, Ruby};
|
|
10
|
+
use pulldown_cmark::{Event, Parser};
|
|
11
|
+
use unicode_segmentation::UnicodeSegmentation;
|
|
12
|
+
|
|
13
|
+
use crate::document::apply_filters;
|
|
14
|
+
use crate::options::{build_options, Flags};
|
|
15
|
+
|
|
16
|
+
/// What kind of boundary to cut at.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TruncateAt {
    /// Last top-level Markdown block that fits the budget. Output is
    /// always valid Markdown.
    Block,
    /// Last Unicode word boundary that fits the budget. Output is a
    /// Markdown string but may split an open construct (code fence,
    /// link, emphasis).
    Word,
}

/// Parameters for a truncation pass.
///
/// Callers are expected to set at least one of `chars` / `words`; if both
/// are set, the cut happens when the first of the two budgets is
/// exhausted. With neither set, nothing is ever over budget and the
/// rendered input is returned whole.
///
/// `marker` (e.g. "...") counts toward the budget: its character and word
/// counts are subtracted (saturating at zero) from the limits before
/// content is measured. In block mode the newlines that separate the
/// marker from the last kept block are appended on top of that and are
/// not counted.
#[derive(Debug, Clone)]
pub struct TruncateParams {
    /// Maximum number of characters (Unicode scalar values), if limited.
    pub chars: Option<usize>,
    /// Maximum number of Unicode words, if limited.
    pub words: Option<usize>,
    /// Boundary kind to cut at.
    pub at: TruncateAt,
    /// Text appended when something was cut off; `None` appends nothing.
    pub marker: Option<String>,
}
|
|
39
|
+
|
|
40
|
+
/// Full-document entry point: parse + filter + truncate.
|
|
41
|
+
pub fn truncate_source(
|
|
42
|
+
source: &str,
|
|
43
|
+
cm_opts: pulldown_cmark::Options,
|
|
44
|
+
flags: &Flags,
|
|
45
|
+
params: &TruncateParams,
|
|
46
|
+
) -> String {
|
|
47
|
+
let events: Vec<Event> = Parser::new_ext(source, cm_opts).collect();
|
|
48
|
+
let events = apply_filters(events, flags);
|
|
49
|
+
truncate_events(&events, params)
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/// Ruby-facing entry point. Expects `params_hash` to contain
|
|
53
|
+
/// `:chars`, `:words`, `:at` (`:block` | `:word`), and `:marker`
|
|
54
|
+
/// (a String or nil). Argument validation lives on the Ruby side;
|
|
55
|
+
/// we just read the values defensively here.
|
|
56
|
+
pub fn native_truncate_markdown(
|
|
57
|
+
ruby: &Ruby,
|
|
58
|
+
source: String,
|
|
59
|
+
params_hash: RHash,
|
|
60
|
+
opts_hash: RHash,
|
|
61
|
+
) -> Result<String, Error> {
|
|
62
|
+
let (cm_opts, flags) = build_options(ruby, opts_hash)?;
|
|
63
|
+
let params = parse_params(ruby, params_hash)?;
|
|
64
|
+
Ok(truncate_source(&source, cm_opts, &flags, ¶ms))
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
pub fn parse_params(ruby: &Ruby, hash: RHash) -> Result<TruncateParams, Error> {
|
|
68
|
+
let chars: Option<usize> = hash.lookup(ruby.to_symbol("chars"))?;
|
|
69
|
+
let words: Option<usize> = hash.lookup(ruby.to_symbol("words"))?;
|
|
70
|
+
let at_sym: Option<String> = hash.lookup(ruby.to_symbol("at"))?;
|
|
71
|
+
let marker: Option<String> = hash.lookup(ruby.to_symbol("marker"))?;
|
|
72
|
+
|
|
73
|
+
let at = match at_sym.as_deref() {
|
|
74
|
+
Some("block") => TruncateAt::Block,
|
|
75
|
+
Some("word") => TruncateAt::Word,
|
|
76
|
+
_ => TruncateAt::Block,
|
|
77
|
+
};
|
|
78
|
+
|
|
79
|
+
Ok(TruncateParams {
|
|
80
|
+
chars,
|
|
81
|
+
words,
|
|
82
|
+
at,
|
|
83
|
+
marker,
|
|
84
|
+
})
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
/// Core truncation over a filter-applied event slice.
|
|
88
|
+
pub fn truncate_events(events: &[Event<'_>], params: &TruncateParams) -> String {
|
|
89
|
+
match params.at {
|
|
90
|
+
TruncateAt::Block => truncate_at_block(events, params),
|
|
91
|
+
TruncateAt::Word => truncate_at_word(events, params),
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
fn truncate_at_block(events: &[Event<'_>], params: &TruncateParams) -> String {
|
|
96
|
+
let marker_chars = marker_chars(¶ms.marker);
|
|
97
|
+
let marker_words = marker_words(¶ms.marker);
|
|
98
|
+
let char_budget = params.chars.map(|n| n.saturating_sub(marker_chars));
|
|
99
|
+
let word_budget = params.words.map(|n| n.saturating_sub(marker_words));
|
|
100
|
+
|
|
101
|
+
let mut kept = String::new();
|
|
102
|
+
let mut used_chars: usize = 0;
|
|
103
|
+
let mut used_words: usize = 0;
|
|
104
|
+
let mut any_dropped = false;
|
|
105
|
+
|
|
106
|
+
for (start, end) in top_level_blocks(events) {
|
|
107
|
+
let block = render_markdown(&events[start..=end]);
|
|
108
|
+
let block_chars = block.trim_end().chars().count();
|
|
109
|
+
let block_words = block.unicode_words().count();
|
|
110
|
+
|
|
111
|
+
let would_exceed_chars = char_budget
|
|
112
|
+
.map(|b| used_chars + block_chars > b)
|
|
113
|
+
.unwrap_or(false);
|
|
114
|
+
let would_exceed_words = word_budget
|
|
115
|
+
.map(|b| used_words + block_words > b)
|
|
116
|
+
.unwrap_or(false);
|
|
117
|
+
|
|
118
|
+
if would_exceed_chars || would_exceed_words {
|
|
119
|
+
any_dropped = true;
|
|
120
|
+
break;
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
kept.push_str(&block);
|
|
124
|
+
used_chars += block_chars;
|
|
125
|
+
used_words += block_words;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
// If nothing was dropped and source fit, return unchanged (no marker).
|
|
129
|
+
if !any_dropped {
|
|
130
|
+
return kept;
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
// First block alone exceeded the budget; honest empty return
|
|
134
|
+
if kept.is_empty() {
|
|
135
|
+
return String::new();
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
// Trim trailing whitespace before appending the marker so the
|
|
139
|
+
// ellipsis attaches cleanly to the last block.
|
|
140
|
+
while kept.ends_with(|c: char| c.is_whitespace()) {
|
|
141
|
+
kept.pop();
|
|
142
|
+
}
|
|
143
|
+
if let Some(marker) = ¶ms.marker {
|
|
144
|
+
kept.push_str("\n\n");
|
|
145
|
+
kept.push_str(marker);
|
|
146
|
+
}
|
|
147
|
+
kept.push('\n');
|
|
148
|
+
kept
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
/// Return (start, end) event-index pairs for top-level blocks.
|
|
152
|
+
/// A block starts at an `Event::Start` with depth 0 and ends at the
|
|
153
|
+
/// matching `Event::End`. Leaf events at depth 0 (Rule, HardBreak if
|
|
154
|
+
/// it ever happens, raw Html block) count as single-event blocks.
|
|
155
|
+
fn top_level_blocks(events: &[Event<'_>]) -> Vec<(usize, usize)> {
|
|
156
|
+
let mut blocks = Vec::new();
|
|
157
|
+
let mut depth: i32 = 0;
|
|
158
|
+
let mut current_start: Option<usize> = None;
|
|
159
|
+
|
|
160
|
+
for (i, event) in events.iter().enumerate() {
|
|
161
|
+
match event {
|
|
162
|
+
Event::Start(_) => {
|
|
163
|
+
if depth == 0 {
|
|
164
|
+
current_start = Some(i);
|
|
165
|
+
}
|
|
166
|
+
depth += 1;
|
|
167
|
+
}
|
|
168
|
+
Event::End(_) => {
|
|
169
|
+
depth -= 1;
|
|
170
|
+
if depth == 0 {
|
|
171
|
+
if let Some(start) = current_start.take() {
|
|
172
|
+
blocks.push((start, i));
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
_ => {
|
|
177
|
+
if depth == 0 {
|
|
178
|
+
// Standalone top-level event (Event::Rule, etc.).
|
|
179
|
+
blocks.push((i, i));
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
blocks
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
fn render_markdown(events: &[Event<'_>]) -> String {
|
|
189
|
+
let mut buf = String::new();
|
|
190
|
+
pulldown_cmark_to_cmark::cmark(events.iter().cloned(), &mut buf)
|
|
191
|
+
.expect("markdown serialization failed");
|
|
192
|
+
buf
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
fn truncate_at_word(events: &[Event<'_>], params: &TruncateParams) -> String {
|
|
196
|
+
let rendered = render_markdown(events);
|
|
197
|
+
let marker_chars = marker_chars(¶ms.marker);
|
|
198
|
+
let marker_words = marker_words(¶ms.marker);
|
|
199
|
+
|
|
200
|
+
let total_chars = rendered.chars().count();
|
|
201
|
+
let total_words = rendered.unicode_words().count();
|
|
202
|
+
|
|
203
|
+
let chars_ok = params
|
|
204
|
+
.chars
|
|
205
|
+
.map(|limit| total_chars + marker_chars <= limit)
|
|
206
|
+
.unwrap_or(true);
|
|
207
|
+
let words_ok = params
|
|
208
|
+
.words
|
|
209
|
+
.map(|limit| total_words + marker_words <= limit)
|
|
210
|
+
.unwrap_or(true);
|
|
211
|
+
|
|
212
|
+
// Fits under both budgets: return unchanged, no marker.
|
|
213
|
+
if chars_ok && words_ok {
|
|
214
|
+
return rendered;
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
let char_budget = params.chars.map(|n| n.saturating_sub(marker_chars));
|
|
218
|
+
let word_budget = params.words.map(|n| n.saturating_sub(marker_words));
|
|
219
|
+
|
|
220
|
+
// Walk word boundaries, tracking cumulative char and word counts.
|
|
221
|
+
// `last_good_end` is the byte offset of the end of the last word
|
|
222
|
+
// segment that stays within both budgets.
|
|
223
|
+
let mut last_good_end: usize = 0;
|
|
224
|
+
let mut used_chars: usize = 0;
|
|
225
|
+
let mut used_words: usize = 0;
|
|
226
|
+
|
|
227
|
+
for (offset, segment) in rendered.split_word_bound_indices() {
|
|
228
|
+
let seg_chars = segment.chars().count();
|
|
229
|
+
let seg_is_word = segment.unicode_words().next().is_some();
|
|
230
|
+
let next_words = if seg_is_word {
|
|
231
|
+
used_words + 1
|
|
232
|
+
} else {
|
|
233
|
+
used_words
|
|
234
|
+
};
|
|
235
|
+
let next_chars = used_chars + seg_chars;
|
|
236
|
+
|
|
237
|
+
let over_chars = char_budget.map(|b| next_chars > b).unwrap_or(false);
|
|
238
|
+
let over_words = word_budget.map(|b| next_words > b).unwrap_or(false);
|
|
239
|
+
if over_chars || over_words {
|
|
240
|
+
break;
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
used_chars = next_chars;
|
|
244
|
+
used_words = next_words;
|
|
245
|
+
last_good_end = offset + segment.len();
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
let mut out = rendered[..last_good_end].to_string();
|
|
249
|
+
while out.ends_with(|c: char| c.is_whitespace()) {
|
|
250
|
+
out.pop();
|
|
251
|
+
}
|
|
252
|
+
if let Some(marker) = ¶ms.marker {
|
|
253
|
+
out.push_str(marker);
|
|
254
|
+
}
|
|
255
|
+
out
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
/// Number of characters (Unicode scalar values) in the marker, or 0 when
/// no marker is set.
fn marker_chars(marker: &Option<String>) -> usize {
    marker.as_deref().map_or(0, |text| text.chars().count())
}
|
|
261
|
+
|
|
262
|
+
fn marker_words(marker: &Option<String>) -> usize {
|
|
263
|
+
marker
|
|
264
|
+
.as_ref()
|
|
265
|
+
.map(|m| m.unicode_words().count())
|
|
266
|
+
.unwrap_or(0)
|
|
267
|
+
}
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
//! URL-allowlist matching shared between the link and image filters.
|
|
2
|
+
//!
|
|
3
|
+
//! Two entry points, both using `url::Url::parse` as the single source
|
|
4
|
+
//! of truth for URL decomposition:
|
|
5
|
+
//!
|
|
6
|
+
//! - [`is_host_allowed`]: match the URL's host against a glob allowlist.
|
|
7
|
+
//! - [`is_scheme_allowed`]: match the URL's scheme against a string set.
|
|
8
|
+
//!
|
|
9
|
+
//! Both fail-open on URLs the parser can't resolve (relative paths,
|
|
10
|
+
//! anchors, protocol-relative, malformed input): such URLs have nothing
|
|
11
|
+
//! to check against, so they pass through unchanged.
|
|
12
|
+
|
|
13
|
+
use globset::GlobSet;
|
|
14
|
+
|
|
15
|
+
/// Return true when the URL should be kept by a host allowlist.
|
|
16
|
+
///
|
|
17
|
+
/// - **Has a host** (http/https URLs): lowercase and match against `set`.
|
|
18
|
+
/// - **No host** (relative `/foo`, anchor `#x`, `mailto:`, `tel:`,
|
|
19
|
+
/// `javascript:`, malformed input): out of scope for host allowlisting,
|
|
20
|
+
/// return true so the caller leaves it alone.
|
|
21
|
+
///
|
|
22
|
+
/// Fail-open on parse failure is deliberate: relative URLs must pass
|
|
23
|
+
/// through unchanged, and "can't parse" is how `url` signals that.
|
|
24
|
+
pub fn is_host_allowed(url: &str, set: &GlobSet) -> bool {
|
|
25
|
+
match url::Url::parse(url)
|
|
26
|
+
.ok()
|
|
27
|
+
.and_then(|u| u.host_str().map(str::to_ascii_lowercase))
|
|
28
|
+
{
|
|
29
|
+
Some(host) => set.is_match(&host),
|
|
30
|
+
None => true,
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
/// Return true when the URL's scheme is in the allowlist.
|
|
35
|
+
///
|
|
36
|
+
/// - **Parses as absolute URL**: check the scheme (lowercased by `url::Url`)
|
|
37
|
+
/// against `allowed`.
|
|
38
|
+
/// - **Doesn't parse** (relative `/foo`, anchor `#x`, protocol-relative
|
|
39
|
+
/// `//host/x`, malformed input): no scheme to check, return true.
|
|
40
|
+
///
|
|
41
|
+
/// `allowed` is a `&[String]` rather than a `HashSet` because every
|
|
42
|
+
/// realistic scheme allowlist has 2–5 entries, a linear scan of short
|
|
43
|
+
/// strings beats any hash table on CPU cache alone at that size. The
|
|
44
|
+
/// caller must pre-lowercase entries; `url::Url` normalizes schemes to
|
|
45
|
+
/// lowercase at parse time, so comparing against lowercase entries is
|
|
46
|
+
/// correct.
|
|
47
|
+
pub fn is_scheme_allowed(url: &str, allowed: &[String]) -> bool {
|
|
48
|
+
match url::Url::parse(url) {
|
|
49
|
+
Ok(parsed) => {
|
|
50
|
+
let scheme = parsed.scheme();
|
|
51
|
+
allowed.iter().any(|s| s == scheme)
|
|
52
|
+
}
|
|
53
|
+
Err(_) => true,
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
#[cfg(test)]
mod tests {
    use super::{is_host_allowed, is_scheme_allowed};
    use globset::{Glob, GlobSetBuilder};

    /// Compile glob patterns into a host allowlist.
    fn host_set(patterns: &[&str]) -> globset::GlobSet {
        let mut builder = GlobSetBuilder::new();
        for pattern in patterns {
            builder.add(Glob::new(pattern).unwrap());
        }
        builder.build().unwrap()
    }

    /// Build a lowercased scheme allowlist.
    fn scheme_set(schemes: &[&str]) -> Vec<String> {
        schemes.iter().map(|s| s.to_ascii_lowercase()).collect()
    }

    #[test]
    fn exact_host_matches() {
        let hosts = host_set(&["example.net"]);
        assert!(is_host_allowed("https://example.net/path", &hosts));
        assert!(!is_host_allowed("https://evil.com/path", &hosts));
    }

    #[test]
    fn subdomain_wildcard() {
        let hosts = host_set(&["*.example.net"]);
        for url in [
            "https://cdn.example.net/a.png",
            "https://deeply.nested.example.net/x",
        ] {
            assert!(is_host_allowed(url, &hosts));
        }
        for url in ["https://example.net/x", "https://evil.com/x"] {
            assert!(!is_host_allowed(url, &hosts));
        }
    }

    #[test]
    fn brace_alternation() {
        let hosts = host_set(&["{cdn,static}.example.net"]);
        for url in ["https://cdn.example.net/x", "https://static.example.net/x"] {
            assert!(is_host_allowed(url, &hosts));
        }
        assert!(!is_host_allowed("https://media.example.net/x", &hosts));
    }

    #[test]
    fn case_insensitive_host() {
        let hosts = host_set(&["example.net"]);
        for url in ["https://EXAMPLE.NET/path", "HTTPS://Example.Net/path"] {
            assert!(is_host_allowed(url, &hosts));
        }
    }

    #[test]
    fn port_is_ignored() {
        let hosts = host_set(&["example.net"]);
        assert!(is_host_allowed("https://example.net:8443/x", &hosts));
    }

    #[test]
    fn no_host_passes_through() {
        let hosts = host_set(&["example.net"]);
        for url in [
            "/local/path",
            "relative.html",
            "#anchor",
            "mailto:user@example.net",
            "tel:+1234567890",
            "javascript:alert(1)",
            "",
        ] {
            assert!(is_host_allowed(url, &hosts));
        }
    }

    #[test]
    fn empty_host_allowlist_blocks_all_external() {
        let hosts = host_set(&[]);
        for url in ["https://example.net/x", "https://anything.com/x"] {
            assert!(!is_host_allowed(url, &hosts));
        }
        assert!(is_host_allowed("/local", &hosts));
    }

    #[test]
    fn scheme_matches_allowed() {
        let schemes = scheme_set(&["http", "https", "mailto"]);
        for url in [
            "https://example.net/x",
            "http://example.net/x",
            "mailto:user@example.net",
        ] {
            assert!(is_scheme_allowed(url, &schemes));
        }
    }

    #[test]
    fn scheme_rejects_disallowed() {
        let schemes = scheme_set(&["http", "https", "mailto"]);
        for url in [
            "javascript:alert(1)",
            "vbscript:msgbox",
            "data:text/html,<script>",
            "file:///etc/passwd",
            "tel:+1234567890",
        ] {
            assert!(!is_scheme_allowed(url, &schemes));
        }
    }

    #[test]
    fn scheme_is_case_insensitive_via_url_crate() {
        // url::Url lowercases the scheme at parse time, so mixed-case
        // input matches lowercase allowlist entries.
        let schemes = scheme_set(&["https"]);
        for url in ["HTTPS://example.net", "HttpS://example.net"] {
            assert!(is_scheme_allowed(url, &schemes));
        }
    }

    #[test]
    fn unparseable_url_passes_scheme_check() {
        // Relative, anchor-only, protocol-relative, and empty URLs can't
        // be parsed as absolute—no scheme to check, so they pass.
        let schemes = scheme_set(&["https"]);
        for url in [
            "/local/path",
            "relative.html",
            "#anchor",
            "//cdn.example.net/x",
            "",
        ] {
            assert!(is_scheme_allowed(url, &schemes));
        }
    }

    #[test]
    fn empty_scheme_allowlist_blocks_all_absolute() {
        let schemes = scheme_set(&[]);
        for url in ["https://example.net", "mailto:user@example.net"] {
            assert!(!is_scheme_allowed(url, &schemes));
        }
        // Relative URLs still pass—nothing to match.
        assert!(is_scheme_allowed("/local", &schemes));
    }
}
|