inkmark 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,221 @@
1
+ //! Table-of-contents rendering.
2
+ //!
3
+ //! Converts a slice of `TocEntry` values (produced by `stats::collect`)
4
+ //! into markdown or HTML strings.
5
+
6
+ use pulldown_cmark::HeadingLevel;
7
+ use pulldown_cmark_escape::escape_html;
8
+
9
/// One heading to be listed in the table of contents.
pub struct TocEntry {
    /// Heading level (`H1`–`H6`); drives depth filtering and nesting.
    pub level: HeadingLevel,
    /// Heading text shown as the link label; the renderers escape it.
    pub text: String,
    /// Anchor id the renderers emit after `#` in the link target.
    pub slug: String,
}
14
+
15
+ /// Render TOC entries as a markdown list.
16
+ ///
17
+ /// Square brackets in heading text are escaped (`[` → `\[`, `]` → `\]`) so
18
+ /// they don't break the markdown link syntax `[text](#slug)`.
19
+ ///
20
+ /// When `max_depth` is `Some(n)`, only headings at level ≤ n are included.
21
+ /// `None` means no depth filtering (every heading appears).
22
+ pub fn toc_to_markdown(entries: &[TocEntry], max_depth: Option<u8>) -> String {
23
+ let filtered: Vec<&TocEntry> = entries
24
+ .iter()
25
+ .filter(|e| max_depth.map_or(true, |max| level_to_u8(e.level) <= max))
26
+ .collect();
27
+ if filtered.is_empty() {
28
+ return String::new();
29
+ }
30
+ let min_level = filtered
31
+ .iter()
32
+ .map(|e| level_to_u8(e.level))
33
+ .min()
34
+ .unwrap_or(1);
35
+
36
+ let mut buf = String::new();
37
+ for entry in &filtered {
38
+ let indent = (level_to_u8(entry.level) - min_level) as usize * 2;
39
+ for _ in 0..indent {
40
+ buf.push(' ');
41
+ }
42
+ buf.push_str("- [");
43
+ // Escape [ and ] so they don't break the markdown link syntax.
44
+ for ch in entry.text.chars() {
45
+ match ch {
46
+ '[' => buf.push_str("\\["),
47
+ ']' => buf.push_str("\\]"),
48
+ c => buf.push(c),
49
+ }
50
+ }
51
+ buf.push_str("](#");
52
+ buf.push_str(&entry.slug);
53
+ buf.push_str(")\n");
54
+ }
55
+ buf
56
+ }
57
+
58
+ /// Render TOC entries as a nested HTML `<ul>` list.
59
+ ///
60
+ /// When `max_depth` is `Some(n)`, only headings at level ≤ n are included.
61
+ pub fn toc_to_html(entries: &[TocEntry], max_depth: Option<u8>) -> String {
62
+ let filtered: Vec<&TocEntry> = entries
63
+ .iter()
64
+ .filter(|e| max_depth.map_or(true, |max| level_to_u8(e.level) <= max))
65
+ .collect();
66
+ if filtered.is_empty() {
67
+ return String::new();
68
+ }
69
+ let min_level = filtered
70
+ .iter()
71
+ .map(|e| level_to_u8(e.level))
72
+ .min()
73
+ .unwrap_or(1);
74
+ let mut buf = String::new();
75
+ let mut open_levels: Vec<u8> = Vec::new();
76
+
77
+ for entry in &filtered {
78
+ let lvl = level_to_u8(entry.level);
79
+
80
+ while open_levels
81
+ .last()
82
+ .copied()
83
+ .unwrap_or(min_level.saturating_sub(1))
84
+ >= lvl
85
+ {
86
+ buf.push_str("</li>\n</ul>\n");
87
+ open_levels.pop();
88
+ }
89
+
90
+ while open_levels
91
+ .last()
92
+ .copied()
93
+ .unwrap_or(min_level.saturating_sub(1))
94
+ < lvl
95
+ {
96
+ buf.push_str("<ul>\n");
97
+ open_levels.push(lvl);
98
+ }
99
+
100
+ buf.push_str("<li><a href=\"#");
101
+ buf.push_str(&entry.slug);
102
+ buf.push_str("\">");
103
+
104
+ let _ = escape_html(&mut buf, &entry.text);
105
+ buf.push_str("</a>\n");
106
+ }
107
+
108
+ while open_levels.pop().is_some() {
109
+ buf.push_str("</li>\n</ul>\n");
110
+ }
111
+
112
+ buf
113
+ }
114
+
115
+ #[inline]
116
+ pub fn level_to_u8(level: HeadingLevel) -> u8 {
117
+ match level {
118
+ HeadingLevel::H1 => 1,
119
+ HeadingLevel::H2 => 2,
120
+ HeadingLevel::H3 => 3,
121
+ HeadingLevel::H4 => 4,
122
+ HeadingLevel::H5 => 5,
123
+ HeadingLevel::H6 => 6,
124
+ }
125
+ }
126
+
127
#[cfg(test)]
mod tests {
    use super::*;
    use pulldown_cmark::HeadingLevel;

    /// Build a `TocEntry` from borrowed strings.
    fn entry(level: HeadingLevel, text: &str, slug: &str) -> TocEntry {
        TocEntry {
            level,
            text: text.to_string(),
            slug: slug.to_string(),
        }
    }

    #[test]
    fn empty_toc_returns_empty_string() {
        assert_eq!(toc_to_markdown(&[], None), "");
        assert_eq!(toc_to_html(&[], None), "");
    }

    #[test]
    fn markdown_escapes_brackets_in_heading_text() {
        let entries = vec![entry(HeadingLevel::H1, "Arrays [1..n]", "arrays-1-n")];
        let md = toc_to_markdown(&entries, None);
        assert!(md.contains("\\[1..n\\]"), "brackets must be escaped: {md}");
        assert!(md.contains("(#arrays-1-n)"));
    }

    #[test]
    fn markdown_simple_toc() {
        let entries = vec![
            entry(HeadingLevel::H1, "Introduction", "introduction"),
            entry(HeadingLevel::H2, "Background", "background"),
        ];
        let md = toc_to_markdown(&entries, None);
        // The h2 sits one level below the h1, so it is indented two spaces.
        assert_eq!(
            md,
            "- [Introduction](#introduction)\n  - [Background](#background)\n"
        );
    }

    #[test]
    fn html_toc_escapes_text() {
        let entries = vec![entry(HeadingLevel::H1, "A & B <C>", "a-b-c")];
        let html = toc_to_html(&entries, None);
        assert!(html.contains("A &amp; B &lt;C&gt;"));
    }

    #[test]
    fn max_depth_filters_markdown() {
        let entries = vec![
            entry(HeadingLevel::H1, "Top", "top"),
            entry(HeadingLevel::H2, "Mid", "mid"),
            entry(HeadingLevel::H3, "Deep", "deep"),
        ];
        let md = toc_to_markdown(&entries, Some(2));
        assert!(md.contains("[Top]"));
        assert!(md.contains("[Mid]"));
        assert!(!md.contains("[Deep]"));
    }

    #[test]
    fn max_depth_filters_html() {
        let entries = vec![
            entry(HeadingLevel::H1, "Top", "top"),
            entry(HeadingLevel::H2, "Mid", "mid"),
            entry(HeadingLevel::H3, "Deep", "deep"),
        ];
        let html = toc_to_html(&entries, Some(2));
        assert!(html.contains(">Top</a>"));
        assert!(html.contains(">Mid</a>"));
        assert!(!html.contains(">Deep</a>"));
    }

    #[test]
    fn max_depth_one_keeps_only_h1() {
        let entries = vec![
            entry(HeadingLevel::H1, "Top", "top"),
            entry(HeadingLevel::H2, "Mid", "mid"),
        ];
        let md = toc_to_markdown(&entries, Some(1));
        assert_eq!(md, "- [Top](#top)\n");
    }

    #[test]
    fn max_depth_respects_remaining_entries_for_min_level() {
        // With h3 filtered out, min_level is h1, so h2 gets 2-space indent.
        let entries = vec![
            entry(HeadingLevel::H1, "Top", "top"),
            entry(HeadingLevel::H2, "Mid", "mid"),
            entry(HeadingLevel::H3, "Deep", "deep"),
        ];
        let md = toc_to_markdown(&entries, Some(2));
        assert_eq!(md, "- [Top](#top)\n  - [Mid](#mid)\n");
    }
}
@@ -0,0 +1,267 @@
1
+ //! Markdown truncation for LLM / RAG pipelines.
2
+ //!
3
+ //! Cuts a filter-applied event stream at either a block boundary
4
+ //! (`TruncateAt::Block`) or a Unicode word boundary
5
+ //! (`TruncateAt::Word`), respecting optional character and word
6
+ //! budgets. Designed as a first-stage preprocessor for embedding
7
+ //! input, context-window budgeting, and chunk normalization.
8
+
9
+ use magnus::{Error, RHash, Ruby};
10
+ use pulldown_cmark::{Event, Parser};
11
+ use unicode_segmentation::UnicodeSegmentation;
12
+
13
+ use crate::document::apply_filters;
14
+ use crate::options::{build_options, Flags};
15
+
16
/// What kind of boundary to cut at.
///
/// Selects which truncation routine `truncate_events` dispatches to.
#[derive(Clone, Copy, PartialEq, Eq)]
pub enum TruncateAt {
    /// Last top-level Markdown block that fits the budget. Output is
    /// always valid Markdown.
    Block,
    /// Last Unicode word boundary that fits the budget. Output is a
    /// Markdown string but may split an open construct (code fence,
    /// link, emphasis).
    Word,
}
27
+
28
/// Parameters for a truncation pass. At least one of `chars` / `words`
/// must be `Some`; if both are set, cut when the first of the two
/// budgets is exhausted. `marker` ("...") counts toward the budget:
/// if supplied, the effective content budget is reduced so that final
/// output length stays at or under the user-given limit.
///
/// The "at least one" rule is not checked in this module; with both set
/// to `None` the truncation routines return the rendered input uncut.
pub struct TruncateParams {
    /// Character limit; `None` disables the character budget.
    pub chars: Option<usize>,
    /// Word limit (Unicode words); `None` disables the word budget.
    pub words: Option<usize>,
    /// Boundary kind to cut at.
    pub at: TruncateAt,
    /// Text appended to the output when content was cut; its character and
    /// word counts are subtracted from the budgets.
    pub marker: Option<String>,
}
39
+
40
+ /// Full-document entry point: parse + filter + truncate.
41
+ pub fn truncate_source(
42
+ source: &str,
43
+ cm_opts: pulldown_cmark::Options,
44
+ flags: &Flags,
45
+ params: &TruncateParams,
46
+ ) -> String {
47
+ let events: Vec<Event> = Parser::new_ext(source, cm_opts).collect();
48
+ let events = apply_filters(events, flags);
49
+ truncate_events(&events, params)
50
+ }
51
+
52
+ /// Ruby-facing entry point. Expects `params_hash` to contain
53
+ /// `:chars`, `:words`, `:at` (`:block` | `:word`), and `:marker`
54
+ /// (a String or nil). Argument validation lives on the Ruby side;
55
+ /// we just read the values defensively here.
56
+ pub fn native_truncate_markdown(
57
+ ruby: &Ruby,
58
+ source: String,
59
+ params_hash: RHash,
60
+ opts_hash: RHash,
61
+ ) -> Result<String, Error> {
62
+ let (cm_opts, flags) = build_options(ruby, opts_hash)?;
63
+ let params = parse_params(ruby, params_hash)?;
64
+ Ok(truncate_source(&source, cm_opts, &flags, &params))
65
+ }
66
+
67
+ pub fn parse_params(ruby: &Ruby, hash: RHash) -> Result<TruncateParams, Error> {
68
+ let chars: Option<usize> = hash.lookup(ruby.to_symbol("chars"))?;
69
+ let words: Option<usize> = hash.lookup(ruby.to_symbol("words"))?;
70
+ let at_sym: Option<String> = hash.lookup(ruby.to_symbol("at"))?;
71
+ let marker: Option<String> = hash.lookup(ruby.to_symbol("marker"))?;
72
+
73
+ let at = match at_sym.as_deref() {
74
+ Some("block") => TruncateAt::Block,
75
+ Some("word") => TruncateAt::Word,
76
+ _ => TruncateAt::Block,
77
+ };
78
+
79
+ Ok(TruncateParams {
80
+ chars,
81
+ words,
82
+ at,
83
+ marker,
84
+ })
85
+ }
86
+
87
+ /// Core truncation over a filter-applied event slice.
88
+ pub fn truncate_events(events: &[Event<'_>], params: &TruncateParams) -> String {
89
+ match params.at {
90
+ TruncateAt::Block => truncate_at_block(events, params),
91
+ TruncateAt::Word => truncate_at_word(events, params),
92
+ }
93
+ }
94
+
95
+ fn truncate_at_block(events: &[Event<'_>], params: &TruncateParams) -> String {
96
+ let marker_chars = marker_chars(&params.marker);
97
+ let marker_words = marker_words(&params.marker);
98
+ let char_budget = params.chars.map(|n| n.saturating_sub(marker_chars));
99
+ let word_budget = params.words.map(|n| n.saturating_sub(marker_words));
100
+
101
+ let mut kept = String::new();
102
+ let mut used_chars: usize = 0;
103
+ let mut used_words: usize = 0;
104
+ let mut any_dropped = false;
105
+
106
+ for (start, end) in top_level_blocks(events) {
107
+ let block = render_markdown(&events[start..=end]);
108
+ let block_chars = block.trim_end().chars().count();
109
+ let block_words = block.unicode_words().count();
110
+
111
+ let would_exceed_chars = char_budget
112
+ .map(|b| used_chars + block_chars > b)
113
+ .unwrap_or(false);
114
+ let would_exceed_words = word_budget
115
+ .map(|b| used_words + block_words > b)
116
+ .unwrap_or(false);
117
+
118
+ if would_exceed_chars || would_exceed_words {
119
+ any_dropped = true;
120
+ break;
121
+ }
122
+
123
+ kept.push_str(&block);
124
+ used_chars += block_chars;
125
+ used_words += block_words;
126
+ }
127
+
128
+ // If nothing was dropped and source fit, return unchanged (no marker).
129
+ if !any_dropped {
130
+ return kept;
131
+ }
132
+
133
+ // First block alone exceeded the budget; honest empty return
134
+ if kept.is_empty() {
135
+ return String::new();
136
+ }
137
+
138
+ // Trim trailing whitespace before appending the marker so the
139
+ // ellipsis attaches cleanly to the last block.
140
+ while kept.ends_with(|c: char| c.is_whitespace()) {
141
+ kept.pop();
142
+ }
143
+ if let Some(marker) = &params.marker {
144
+ kept.push_str("\n\n");
145
+ kept.push_str(marker);
146
+ }
147
+ kept.push('\n');
148
+ kept
149
+ }
150
+
151
+ /// Return (start, end) event-index pairs for top-level blocks.
152
+ /// A block starts at an `Event::Start` with depth 0 and ends at the
153
+ /// matching `Event::End`. Leaf events at depth 0 (Rule, HardBreak if
154
+ /// it ever happens, raw Html block) count as single-event blocks.
155
+ fn top_level_blocks(events: &[Event<'_>]) -> Vec<(usize, usize)> {
156
+ let mut blocks = Vec::new();
157
+ let mut depth: i32 = 0;
158
+ let mut current_start: Option<usize> = None;
159
+
160
+ for (i, event) in events.iter().enumerate() {
161
+ match event {
162
+ Event::Start(_) => {
163
+ if depth == 0 {
164
+ current_start = Some(i);
165
+ }
166
+ depth += 1;
167
+ }
168
+ Event::End(_) => {
169
+ depth -= 1;
170
+ if depth == 0 {
171
+ if let Some(start) = current_start.take() {
172
+ blocks.push((start, i));
173
+ }
174
+ }
175
+ }
176
+ _ => {
177
+ if depth == 0 {
178
+ // Standalone top-level event (Event::Rule, etc.).
179
+ blocks.push((i, i));
180
+ }
181
+ }
182
+ }
183
+ }
184
+
185
+ blocks
186
+ }
187
+
188
/// Serialize `events` back into Markdown text via
/// `pulldown_cmark_to_cmark::cmark`.
///
/// # Panics
///
/// Panics with "markdown serialization failed" if the serializer returns
/// an error.
fn render_markdown(events: &[Event<'_>]) -> String {
    let mut buf = String::new();
    // The serializer takes events by value, so the borrowed slice is cloned.
    pulldown_cmark_to_cmark::cmark(events.iter().cloned(), &mut buf)
        .expect("markdown serialization failed");
    buf
}
194
+
195
+ fn truncate_at_word(events: &[Event<'_>], params: &TruncateParams) -> String {
196
+ let rendered = render_markdown(events);
197
+ let marker_chars = marker_chars(&params.marker);
198
+ let marker_words = marker_words(&params.marker);
199
+
200
+ let total_chars = rendered.chars().count();
201
+ let total_words = rendered.unicode_words().count();
202
+
203
+ let chars_ok = params
204
+ .chars
205
+ .map(|limit| total_chars + marker_chars <= limit)
206
+ .unwrap_or(true);
207
+ let words_ok = params
208
+ .words
209
+ .map(|limit| total_words + marker_words <= limit)
210
+ .unwrap_or(true);
211
+
212
+ // Fits under both budgets: return unchanged, no marker.
213
+ if chars_ok && words_ok {
214
+ return rendered;
215
+ }
216
+
217
+ let char_budget = params.chars.map(|n| n.saturating_sub(marker_chars));
218
+ let word_budget = params.words.map(|n| n.saturating_sub(marker_words));
219
+
220
+ // Walk word boundaries, tracking cumulative char and word counts.
221
+ // `last_good_end` is the byte offset of the end of the last word
222
+ // segment that stays within both budgets.
223
+ let mut last_good_end: usize = 0;
224
+ let mut used_chars: usize = 0;
225
+ let mut used_words: usize = 0;
226
+
227
+ for (offset, segment) in rendered.split_word_bound_indices() {
228
+ let seg_chars = segment.chars().count();
229
+ let seg_is_word = segment.unicode_words().next().is_some();
230
+ let next_words = if seg_is_word {
231
+ used_words + 1
232
+ } else {
233
+ used_words
234
+ };
235
+ let next_chars = used_chars + seg_chars;
236
+
237
+ let over_chars = char_budget.map(|b| next_chars > b).unwrap_or(false);
238
+ let over_words = word_budget.map(|b| next_words > b).unwrap_or(false);
239
+ if over_chars || over_words {
240
+ break;
241
+ }
242
+
243
+ used_chars = next_chars;
244
+ used_words = next_words;
245
+ last_good_end = offset + segment.len();
246
+ }
247
+
248
+ let mut out = rendered[..last_good_end].to_string();
249
+ while out.ends_with(|c: char| c.is_whitespace()) {
250
+ out.pop();
251
+ }
252
+ if let Some(marker) = &params.marker {
253
+ out.push_str(marker);
254
+ }
255
+ out
256
+ }
257
+
258
/// Number of `char`s in the marker, or 0 when there is no marker.
fn marker_chars(marker: &Option<String>) -> usize {
    match marker {
        Some(text) => text.chars().count(),
        None => 0,
    }
}
261
+
262
+ fn marker_words(marker: &Option<String>) -> usize {
263
+ marker
264
+ .as_ref()
265
+ .map(|m| m.unicode_words().count())
266
+ .unwrap_or(0)
267
+ }
@@ -0,0 +1,178 @@
1
+ //! URL-allowlist matching shared between the link and image filters.
2
+ //!
3
+ //! Two entry points, both using `url::Url::parse` as the single source
4
+ //! of truth for URL decomposition:
5
+ //!
6
+ //! - [`is_host_allowed`]: match the URL's host against a glob allowlist.
7
+ //! - [`is_scheme_allowed`]: match the URL's scheme against a string set.
8
+ //!
9
+ //! Both fail-open on URLs the parser can't resolve (relative paths,
10
+ //! anchors, protocol-relative, malformed input): such URLs have nothing
11
+ //! to check against, so they pass through unchanged.
12
+
13
+ use globset::GlobSet;
14
+
15
+ /// Return true when the URL should be kept by a host allowlist.
16
+ ///
17
+ /// - **Has a host** (http/https URLs): lowercase and match against `set`.
18
+ /// - **No host** (relative `/foo`, anchor `#x`, `mailto:`, `tel:`,
19
+ /// `javascript:`, malformed input): out of scope for host allowlisting,
20
+ /// return true so the caller leaves it alone.
21
+ ///
22
+ /// Fail-open on parse failure is deliberate: relative URLs must pass
23
+ /// through unchanged, and "can't parse" is how `url` signals that.
24
+ pub fn is_host_allowed(url: &str, set: &GlobSet) -> bool {
25
+ match url::Url::parse(url)
26
+ .ok()
27
+ .and_then(|u| u.host_str().map(str::to_ascii_lowercase))
28
+ {
29
+ Some(host) => set.is_match(&host),
30
+ None => true,
31
+ }
32
+ }
33
+
34
+ /// Return true when the URL's scheme is in the allowlist.
35
+ ///
36
+ /// - **Parses as absolute URL**: check the scheme (lowercased by `url::Url`)
37
+ /// against `allowed`.
38
+ /// - **Doesn't parse** (relative `/foo`, anchor `#x`, protocol-relative
39
+ /// `//host/x`, malformed input): no scheme to check, return true.
40
+ ///
41
+ /// `allowed` is a `&[String]` rather than a `HashSet` because every
42
+ /// realistic scheme allowlist has 2–5 entries, a linear scan of short
43
+ /// strings beats any hash table on CPU cache alone at that size. The
44
+ /// caller must pre-lowercase entries; `url::Url` normalizes schemes to
45
+ /// lowercase at parse time, so comparing against lowercase entries is
46
+ /// correct.
47
+ pub fn is_scheme_allowed(url: &str, allowed: &[String]) -> bool {
48
+ match url::Url::parse(url) {
49
+ Ok(parsed) => {
50
+ let scheme = parsed.scheme();
51
+ allowed.iter().any(|s| s == scheme)
52
+ }
53
+ Err(_) => true,
54
+ }
55
+ }
56
+
57
#[cfg(test)]
mod tests {
    use super::{is_host_allowed, is_scheme_allowed};
    use globset::{Glob, GlobSetBuilder};

    /// Compile glob patterns into a host allowlist; panics on a bad pattern.
    fn host_set(patterns: &[&str]) -> globset::GlobSet {
        let mut b = GlobSetBuilder::new();
        for p in patterns {
            b.add(Glob::new(p).unwrap());
        }
        b.build().unwrap()
    }

    /// Build a scheme allowlist, lowercasing each entry as callers must.
    fn scheme_set(schemes: &[&str]) -> Vec<String> {
        schemes.iter().map(|s| s.to_ascii_lowercase()).collect()
    }

    #[test]
    fn exact_host_matches() {
        let s = host_set(&["example.net"]);
        assert!(is_host_allowed("https://example.net/path", &s));
        assert!(!is_host_allowed("https://evil.com/path", &s));
    }

    #[test]
    fn subdomain_wildcard() {
        let s = host_set(&["*.example.net"]);
        assert!(is_host_allowed("https://cdn.example.net/a.png", &s));
        assert!(is_host_allowed("https://deeply.nested.example.net/x", &s));
        // The wildcard pattern does not cover the bare apex host.
        assert!(!is_host_allowed("https://example.net/x", &s));
        assert!(!is_host_allowed("https://evil.com/x", &s));
    }

    #[test]
    fn brace_alternation() {
        let s = host_set(&["{cdn,static}.example.net"]);
        assert!(is_host_allowed("https://cdn.example.net/x", &s));
        assert!(is_host_allowed("https://static.example.net/x", &s));
        assert!(!is_host_allowed("https://media.example.net/x", &s));
    }

    #[test]
    fn case_insensitive_host() {
        let s = host_set(&["example.net"]);
        assert!(is_host_allowed("https://EXAMPLE.NET/path", &s));
        assert!(is_host_allowed("HTTPS://Example.Net/path", &s));
    }

    #[test]
    fn port_is_ignored() {
        let s = host_set(&["example.net"]);
        assert!(is_host_allowed("https://example.net:8443/x", &s));
    }

    #[test]
    fn no_host_passes_through() {
        // Inputs without a host are out of scope for the host allowlist.
        let s = host_set(&["example.net"]);
        assert!(is_host_allowed("/local/path", &s));
        assert!(is_host_allowed("relative.html", &s));
        assert!(is_host_allowed("#anchor", &s));
        assert!(is_host_allowed("mailto:user@example.net", &s));
        assert!(is_host_allowed("tel:+1234567890", &s));
        assert!(is_host_allowed("javascript:alert(1)", &s));
        assert!(is_host_allowed("", &s));
    }

    #[test]
    fn empty_host_allowlist_blocks_all_external() {
        let s = host_set(&[]);
        assert!(!is_host_allowed("https://example.net/x", &s));
        assert!(!is_host_allowed("https://anything.com/x", &s));
        assert!(is_host_allowed("/local", &s));
    }

    #[test]
    fn scheme_matches_allowed() {
        let s = scheme_set(&["http", "https", "mailto"]);
        assert!(is_scheme_allowed("https://example.net/x", &s));
        assert!(is_scheme_allowed("http://example.net/x", &s));
        assert!(is_scheme_allowed("mailto:user@example.net", &s));
    }

    #[test]
    fn scheme_rejects_disallowed() {
        let s = scheme_set(&["http", "https", "mailto"]);
        assert!(!is_scheme_allowed("javascript:alert(1)", &s));
        assert!(!is_scheme_allowed("vbscript:msgbox", &s));
        assert!(!is_scheme_allowed("data:text/html,<script>", &s));
        assert!(!is_scheme_allowed("file:///etc/passwd", &s));
        assert!(!is_scheme_allowed("tel:+1234567890", &s));
    }

    #[test]
    fn scheme_is_case_insensitive_via_url_crate() {
        // url::Url lowercases the scheme at parse time, so mixed-case
        // input matches lowercase allowlist entries.
        let s = scheme_set(&["https"]);
        assert!(is_scheme_allowed("HTTPS://example.net", &s));
        assert!(is_scheme_allowed("HttpS://example.net", &s));
    }

    #[test]
    fn unparseable_url_passes_scheme_check() {
        // Relative, anchor-only, protocol-relative, and empty URLs can't
        // be parsed as absolute—no scheme to check, so they pass.
        let s = scheme_set(&["https"]);
        assert!(is_scheme_allowed("/local/path", &s));
        assert!(is_scheme_allowed("relative.html", &s));
        assert!(is_scheme_allowed("#anchor", &s));
        assert!(is_scheme_allowed("//cdn.example.net/x", &s));
        assert!(is_scheme_allowed("", &s));
    }

    #[test]
    fn empty_scheme_allowlist_blocks_all_absolute() {
        let s = scheme_set(&[]);
        assert!(!is_scheme_allowed("https://example.net", &s));
        assert!(!is_scheme_allowed("mailto:user@example.net", &s));
        // Relative URLs still pass—nothing to match.
        assert!(is_scheme_allowed("/local", &s));
    }
}