inkmark 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,453 @@
1
+ //! Document statistics and table-of-contents collector.
2
+ //!
3
+ //! Walks a slice of `(Event, byte_range)` tuples once (before filters) and collects:
4
+ //! - Text buffer → character count, word count, language detection
5
+ //! - Heading entries → heading count, TOC (markdown + HTML), heading extract
6
+ //! - Code block count + raw source extract
7
+ //! - Image and link extract metadata
8
+ //! - Footnote definition count + body extract
9
+ //!
10
+ //! Byte ranges come from pulldown-cmark's `OffsetIter`: the Start tag's
11
+ //! range spans the whole source element (e.g. the entire
12
+ //! `` ```ruby\n...\n``` ``
13
+ //! for a fenced code block). The caller (`document.rs`) parses with
14
+ //! `Parser::new_ext(...).into_offset_iter()` and hands us the result.
15
+ //!
16
+ //! The collector is the single source of truth for the full-render path.
17
+ //! Two independent Ruby-side knobs consume its output:
18
+ //! - `statistics: true` => scalar counts and language
19
+ //! detection (`to_statistics_hash`)
20
+ //! - `extract: {...}` => filtered arrays of structured
21
+ //! records (`to_extracts_hash`)
22
+
23
+ use std::ops::Range;
24
+
25
+ use magnus::{Error, RArray, RHash, Ruby};
26
+ use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Tag, TagEnd};
27
+ use unicode_segmentation::UnicodeSegmentation;
28
+
29
+ use crate::heading::{self, SlugDeduplicator};
30
+ use crate::toc::{self, TocEntry};
31
+
32
/// One image (`![alt](src "title")`) found while walking the event stream.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so records can be logged,
/// duplicated and compared in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ImageInfo {
    /// Destination URL taken from the image's Start tag.
    pub src: String,
    /// Plain text accumulated from the text/code events inside the image.
    pub alt: String,
    /// Title taken from the image's Start tag (may be empty).
    pub title: String,
    /// Byte range that accompanied the image's Start event.
    pub byte_range: Range<usize>,
}
38
+
39
/// One link (`[text](href "title")`) found while walking the event stream.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so records can be logged,
/// duplicated and compared in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LinkInfo {
    /// Destination URL taken from the link's Start tag.
    pub href: String,
    /// Plain text accumulated from the text/code events inside the link.
    pub text: String,
    /// Title taken from the link's Start tag (may be empty).
    pub title: String,
    /// Byte range that accompanied the link's Start event.
    pub byte_range: Range<usize>,
}
45
+
46
/// A single fenced or indented code block captured before any filter
/// runs: the `source` is pulldown-cmark's unmodified content, suitable
/// for passing to an external highlighter.
/// `lang` is the info string on a fence (e.g. `"ruby"`); indented code
/// blocks carry the empty string, matching the handler API.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so records can be logged,
/// duplicated and compared in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CodeBlockInfo {
    /// Fence info string; empty for indented blocks.
    pub lang: String,
    /// Concatenation of every text event emitted inside the block.
    pub source: String,
    /// Byte range that accompanied the block's Start event.
    pub byte_range: Range<usize>,
}
56
+
57
/// A footnote definition `[^label]: body`. `text` is the plain-text body:
/// emphasis, links, and inline formatting are flattened to their text
/// content, matching how `ImageInfo.alt` and `LinkInfo.text` are captured.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so records can be logged,
/// duplicated and compared in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FootnoteDefInfo {
    /// Label taken from the definition's Start tag.
    pub label: String,
    /// Text runs inside the definition, joined by single spaces.
    pub text: String,
    /// Byte range that accompanied the definition's Start event.
    pub byte_range: Range<usize>,
}
65
+
66
+ /// Heading record for the `extract[:headings]` projection. Parallel to
67
+ /// `toc::TocEntry` but adds a byte range and uses the `id`/extract
68
+ /// vocabulary. We push to both during the walk: it's one allocation per
69
+ /// heading, and keeps the TOC data type free of byte-range baggage that
70
+ /// its renderer ignores.
71
+ pub struct HeadingInfo {
72
+ pub level: HeadingLevel,
73
+ pub text: String,
74
+ pub id: String,
75
+ pub byte_range: Range<usize>,
76
+ }
77
+
78
+ pub struct Stats {
79
+ pub text_buf: String,
80
+ pub heading_count: usize,
81
+ pub code_blocks: Vec<CodeBlockInfo>,
82
+ pub images: Vec<ImageInfo>,
83
+ pub links: Vec<LinkInfo>,
84
+ pub footnote_definitions: Vec<FootnoteDefInfo>,
85
+ pub headings: Vec<HeadingInfo>,
86
+ pub toc_entries: Vec<TocEntry>,
87
+ pub frontmatter: Option<String>,
88
+ }
89
+
90
/// Which extract arrays to serialize into the Ruby-side `:extracts` hash.
/// Flags map 1:1 to the Ruby-facing `extract: { ... }` hash keys.
///
/// `Default` yields all flags off. `Debug`, `PartialEq` and `Eq` are
/// derived so flag sets can be logged and compared in tests.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct ExtractFlags {
    /// Emit the `images` array.
    pub images: bool,
    /// Emit the `links` array.
    pub links: bool,
    /// Emit the `code_blocks` array.
    pub code_blocks: bool,
    /// Emit the `headings` array.
    pub headings: bool,
    /// Emit the `footnote_definitions` array.
    pub footnote_definitions: bool,
}

impl ExtractFlags {
    /// Returns `true` when at least one extract kind was requested.
    pub fn any(&self) -> bool {
        self.images || self.links || self.code_blocks || self.headings || self.footnote_definitions
    }
}
106
+
107
+ /// Walk events and collect all statistics + TOC entries in one pass.
108
+ /// Call BEFORE filters so we measure original content. Each event
109
+ /// arrives paired with the byte range of its source span: the Start
110
+ /// tag's range is what gets attached to the corresponding extract
111
+ /// record.
112
+ pub fn collect(events: &[(Event<'_>, Range<usize>)]) -> Stats {
113
+ let mut text_buf = String::new();
114
+ let mut code_blocks: Vec<CodeBlockInfo> = Vec::new();
115
+ let mut images: Vec<ImageInfo> = Vec::new();
116
+ let mut links: Vec<LinkInfo> = Vec::new();
117
+ let mut footnote_definitions: Vec<FootnoteDefInfo> = Vec::new();
118
+ let mut frontmatter: Option<String> = None;
119
+ let mut in_metadata_block = false;
120
+
121
+ let mut in_code_block = false;
122
+ let mut in_image = false;
123
+ let mut in_link = false;
124
+ let mut in_footnote_def = false;
125
+ let mut image_alt = String::new();
126
+ let mut link_text = String::new();
127
+ let mut current_code_block: Option<CodeBlockInfo> = None;
128
+ let mut current_image: Option<ImageInfo> = None;
129
+ let mut current_link: Option<LinkInfo> = None;
130
+ let mut current_footnote_def: Option<FootnoteDefInfo> = None;
131
+
132
+ let mut toc_entries: Vec<TocEntry> = Vec::new();
133
+ let mut headings: Vec<HeadingInfo> = Vec::new();
134
+ let mut dedup = SlugDeduplicator::new();
135
+ let mut in_heading = false;
136
+ let mut current_heading_level = HeadingLevel::H1;
137
+ let mut current_heading_text = String::new();
138
+ let mut current_heading_range: Range<usize> = 0..0;
139
+
140
+ for (event, range) in events {
141
+ match event {
142
+ // Headings
143
+ Event::Start(Tag::Heading { level, .. }) => {
144
+ in_heading = true;
145
+ current_heading_level = *level;
146
+ current_heading_text.clear();
147
+ current_heading_range = range.clone();
148
+ }
149
+ Event::End(TagEnd::Heading(_)) if in_heading => {
150
+ in_heading = false;
151
+ let base = heading::slugify(&current_heading_text);
152
+ if !base.is_empty() {
153
+ let slug = dedup.deduplicate(base);
154
+ toc_entries.push(TocEntry {
155
+ level: current_heading_level,
156
+ text: current_heading_text.clone(),
157
+ slug: slug.clone(),
158
+ });
159
+ headings.push(HeadingInfo {
160
+ level: current_heading_level,
161
+ text: current_heading_text.clone(),
162
+ id: slug,
163
+ byte_range: current_heading_range.clone(),
164
+ });
165
+ }
166
+ }
167
+
168
+ // Frontmatter
169
+ Event::Start(Tag::MetadataBlock(_)) => {
170
+ in_metadata_block = true;
171
+ }
172
+ Event::End(TagEnd::MetadataBlock(_)) => {
173
+ in_metadata_block = false;
174
+ }
175
+
176
+ // Code blocks
177
+ Event::Start(Tag::CodeBlock(kind)) => {
178
+ let lang = match kind {
179
+ CodeBlockKind::Fenced(lang) => lang.to_string(),
180
+ CodeBlockKind::Indented => String::new(),
181
+ };
182
+ current_code_block = Some(CodeBlockInfo {
183
+ lang,
184
+ source: String::new(),
185
+ byte_range: range.clone(),
186
+ });
187
+ in_code_block = true;
188
+ }
189
+ Event::End(TagEnd::CodeBlock) => {
190
+ in_code_block = false;
191
+ if let Some(block) = current_code_block.take() {
192
+ code_blocks.push(block);
193
+ }
194
+ }
195
+
196
+ // Images
197
+ Event::Start(Tag::Image {
198
+ dest_url, title, ..
199
+ }) => {
200
+ in_image = true;
201
+ image_alt.clear();
202
+ current_image = Some(ImageInfo {
203
+ src: dest_url.to_string(),
204
+ alt: String::new(),
205
+ title: title.to_string(),
206
+ byte_range: range.clone(),
207
+ });
208
+ }
209
+ Event::End(TagEnd::Image) => {
210
+ in_image = false;
211
+ if let Some(mut img) = current_image.take() {
212
+ img.alt = image_alt.clone();
213
+ images.push(img);
214
+ }
215
+ }
216
+
217
+ // Links
218
+ Event::Start(Tag::Link {
219
+ dest_url, title, ..
220
+ }) => {
221
+ in_link = true;
222
+ link_text.clear();
223
+ current_link = Some(LinkInfo {
224
+ href: dest_url.to_string(),
225
+ text: String::new(),
226
+ title: title.to_string(),
227
+ byte_range: range.clone(),
228
+ });
229
+ }
230
+ Event::End(TagEnd::Link) => {
231
+ in_link = false;
232
+ if let Some(mut lnk) = current_link.take() {
233
+ lnk.text = link_text.clone();
234
+ links.push(lnk);
235
+ }
236
+ }
237
+
238
+ // Footnote definitions
239
+ Event::Start(Tag::FootnoteDefinition(label)) => {
240
+ in_footnote_def = true;
241
+ current_footnote_def = Some(FootnoteDefInfo {
242
+ label: label.to_string(),
243
+ text: String::new(),
244
+ byte_range: range.clone(),
245
+ });
246
+ }
247
+ Event::End(TagEnd::FootnoteDefinition) => {
248
+ in_footnote_def = false;
249
+ if let Some(mut def) = current_footnote_def.take() {
250
+ // Trim a single trailing space left by our " " separator
251
+ // after the last text run—makes the captured body
252
+ // easier to display or diff.
253
+ if def.text.ends_with(' ') {
254
+ def.text.pop();
255
+ }
256
+ footnote_definitions.push(def);
257
+ }
258
+ }
259
+
260
+ // ── Text ──
261
+ Event::Text(t) | Event::Code(t) => {
262
+ if in_metadata_block {
263
+ // Capture the raw YAML frontmatter text;
264
+ // frontmatter is structured config, not content.
265
+ frontmatter = Some(t.to_string());
266
+ } else {
267
+ // Text inside a code block also contributes to the
268
+ // document's character/word totals: code is content,
269
+ // especially for AI/RAG use cases where we want
270
+ // `word_count` to reflect what an embedding model
271
+ // would actually see.
272
+ text_buf.push_str(t);
273
+ text_buf.push(' ');
274
+ if in_code_block {
275
+ if let Some(block) = current_code_block.as_mut() {
276
+ block.source.push_str(t);
277
+ }
278
+ }
279
+ }
280
+ if in_heading {
281
+ current_heading_text.push_str(t);
282
+ }
283
+ if in_image {
284
+ image_alt.push_str(t);
285
+ }
286
+ if in_link {
287
+ link_text.push_str(t);
288
+ }
289
+ if in_footnote_def {
290
+ if let Some(def) = current_footnote_def.as_mut() {
291
+ def.text.push_str(t);
292
+ def.text.push(' ');
293
+ }
294
+ }
295
+ }
296
+ Event::SoftBreak | Event::HardBreak => {
297
+ if !in_code_block {
298
+ text_buf.push(' ');
299
+ }
300
+ }
301
+ _ => {}
302
+ }
303
+ }
304
+
305
+ Stats {
306
+ text_buf,
307
+ heading_count: toc_entries.len(),
308
+ code_blocks,
309
+ images,
310
+ links,
311
+ footnote_definitions,
312
+ headings,
313
+ toc_entries,
314
+ frontmatter,
315
+ }
316
+ }
317
+
318
+ /// Build the `:statistics` hash—scalars only.
319
+ ///
320
+ /// When `full` is true (set by `statistics: true`), emits language
321
+ /// detection, character/word counts, and every `*_count` field.
322
+ /// When false (toc-only mode), emits just `heading_count` so downstream
323
+ /// code that relies on it keeps working without upgrading to full stats.
324
+ pub fn to_statistics_hash(ruby: &Ruby, stats: &Stats, full: bool) -> Result<RHash, Error> {
325
+ let hash = ruby.hash_new();
326
+ hash.aset(ruby.to_symbol("heading_count"), stats.heading_count)?;
327
+
328
+ if full {
329
+ match whatlang::detect(&stats.text_buf) {
330
+ Some(info) => {
331
+ hash.aset(ruby.to_symbol("likely_language"), info.lang().code())?;
332
+ hash.aset(ruby.to_symbol("language_confidence"), info.confidence())?;
333
+ }
334
+ None => {
335
+ hash.aset(ruby.to_symbol("likely_language"), ())?;
336
+ hash.aset(ruby.to_symbol("language_confidence"), ())?;
337
+ }
338
+ }
339
+
340
+ hash.aset(
341
+ ruby.to_symbol("character_count"),
342
+ stats.text_buf.trim().chars().count(),
343
+ )?;
344
+ hash.aset(
345
+ ruby.to_symbol("word_count"),
346
+ stats.text_buf.unicode_words().count(),
347
+ )?;
348
+ hash.aset(ruby.to_symbol("code_block_count"), stats.code_blocks.len())?;
349
+ hash.aset(ruby.to_symbol("image_count"), stats.images.len())?;
350
+ hash.aset(ruby.to_symbol("link_count"), stats.links.len())?;
351
+ hash.aset(
352
+ ruby.to_symbol("footnote_definition_count"),
353
+ stats.footnote_definitions.len(),
354
+ )?;
355
+ }
356
+
357
+ Ok(hash)
358
+ }
359
+
360
+ /// Build the `:extracts` hash. Only keys whose flag is set appear:
361
+ /// callers who opted into one kind aren't charged allocation cost for
362
+ /// the others.
363
+ pub fn to_extracts_hash(ruby: &Ruby, stats: &Stats, flags: ExtractFlags) -> Result<RHash, Error> {
364
+ let hash = ruby.hash_new();
365
+
366
+ if flags.images {
367
+ let arr = ruby.ary_new_capa(stats.images.len());
368
+ for img in &stats.images {
369
+ let h = ruby.hash_new();
370
+ h.aset(ruby.to_symbol("src"), img.src.as_str())?;
371
+ h.aset(ruby.to_symbol("alt"), img.alt.as_str())?;
372
+ h.aset(ruby.to_symbol("title"), img.title.as_str())?;
373
+ h.aset(
374
+ ruby.to_symbol("byte_range"),
375
+ ruby.range_new(img.byte_range.start as i64, img.byte_range.end as i64, true)?,
376
+ )?;
377
+ arr.push(h)?;
378
+ }
379
+ hash.aset(ruby.to_symbol("images"), arr)?;
380
+ }
381
+
382
+ if flags.links {
383
+ let arr = ruby.ary_new_capa(stats.links.len());
384
+ for lnk in &stats.links {
385
+ let h = ruby.hash_new();
386
+ h.aset(ruby.to_symbol("href"), lnk.href.as_str())?;
387
+ h.aset(ruby.to_symbol("text"), lnk.text.as_str())?;
388
+ h.aset(ruby.to_symbol("title"), lnk.title.as_str())?;
389
+ h.aset(
390
+ ruby.to_symbol("byte_range"),
391
+ ruby.range_new(lnk.byte_range.start as i64, lnk.byte_range.end as i64, true)?,
392
+ )?;
393
+ arr.push(h)?;
394
+ }
395
+ hash.aset(ruby.to_symbol("links"), arr)?;
396
+ }
397
+
398
+ if flags.code_blocks {
399
+ let arr = ruby.ary_new_capa(stats.code_blocks.len());
400
+ for block in &stats.code_blocks {
401
+ let h = ruby.hash_new();
402
+ h.aset(ruby.to_symbol("lang"), block.lang.as_str())?;
403
+ h.aset(ruby.to_symbol("source"), block.source.as_str())?;
404
+ h.aset(
405
+ ruby.to_symbol("byte_range"),
406
+ ruby.range_new(
407
+ block.byte_range.start as i64,
408
+ block.byte_range.end as i64,
409
+ true,
410
+ )?,
411
+ )?;
412
+ arr.push(h)?;
413
+ }
414
+ hash.aset(ruby.to_symbol("code_blocks"), arr)?;
415
+ }
416
+
417
+ if flags.headings {
418
+ let arr: RArray = ruby.ary_new_capa(stats.headings.len());
419
+ for entry in &stats.headings {
420
+ let h = ruby.hash_new();
421
+ h.aset(ruby.to_symbol("level"), toc::level_to_u8(entry.level))?;
422
+ h.aset(ruby.to_symbol("text"), entry.text.as_str())?;
423
+ h.aset(ruby.to_symbol("id"), entry.id.as_str())?;
424
+ h.aset(
425
+ ruby.to_symbol("byte_range"),
426
+ ruby.range_new(
427
+ entry.byte_range.start as i64,
428
+ entry.byte_range.end as i64,
429
+ true,
430
+ )?,
431
+ )?;
432
+ arr.push(h)?;
433
+ }
434
+ hash.aset(ruby.to_symbol("headings"), arr)?;
435
+ }
436
+
437
+ if flags.footnote_definitions {
438
+ let arr = ruby.ary_new_capa(stats.footnote_definitions.len());
439
+ for def in &stats.footnote_definitions {
440
+ let h = ruby.hash_new();
441
+ h.aset(ruby.to_symbol("label"), def.label.as_str())?;
442
+ h.aset(ruby.to_symbol("text"), def.text.as_str())?;
443
+ h.aset(
444
+ ruby.to_symbol("byte_range"),
445
+ ruby.range_new(def.byte_range.start as i64, def.byte_range.end as i64, true)?,
446
+ )?;
447
+ arr.push(h)?;
448
+ }
449
+ hash.aset(ruby.to_symbol("footnote_definitions"), arr)?;
450
+ }
451
+
452
+ Ok(hash)
453
+ }
@@ -0,0 +1,226 @@
1
+ //! GFM "Disallowed Raw HTML" extension (spec §6.11).
2
+ //!
3
+ //! Escapes the leading `<` of nine spec-designated tag names so raw
4
+ //! HTML that would change how the document is parsed (or run script)
5
+ //! renders as text instead. Mirrors [comrak](https://github.com/kivikakk/comrak/blob/main/src/html.rs): the
6
+ //! transformation is defined textually by the GFM spec, so we do a
7
+ //! byte scan rather than parse HTML.
8
+
9
+ use pulldown_cmark::{CowStr, Event};
10
+
11
/// Lower-case names of the nine tags the tagfilter escapes. Candidates
/// are compared against these ASCII-case-insensitively.
const DISALLOWED: &[&[u8]] = &[
    b"title", b"textarea", b"style",
    b"xmp", b"iframe", b"noembed",
    b"noframes", b"script", b"plaintext",
];
22
+
23
+ /// Apply the tagfilter to a single event. If the event needs no
24
+ /// rewrite, it's returned unchanged.
25
+ #[inline]
26
+ pub fn apply_event(event: Event<'_>) -> Event<'_> {
27
+ match event {
28
+ Event::Html(s) => match rewrite(&s) {
29
+ Some(out) => Event::Html(CowStr::Boxed(out.into_boxed_str())),
30
+ None => Event::Html(s),
31
+ },
32
+ Event::InlineHtml(s) => match rewrite(&s) {
33
+ Some(out) => Event::InlineHtml(CowStr::Boxed(out.into_boxed_str())),
34
+ None => Event::InlineHtml(s),
35
+ },
36
+ other => other,
37
+ }
38
+ }
39
+
40
+ /// Byte-scan `input` for disallowed tag opens/closes. Returns
41
+ /// `Some(new_string)` when at least one rewrite happened; `None`
42
+ /// when the input is already clean so callers can skip the clone.
43
+ fn rewrite(input: &str) -> Option<String> {
44
+ let bytes = input.as_bytes();
45
+ let mut out: Option<String> = None;
46
+ let mut scan_start = 0;
47
+ let mut i = 0;
48
+
49
+ while i < bytes.len() {
50
+ if bytes[i] == b'<' && is_disallowed_at(bytes, i) {
51
+ let s = out.get_or_insert_with(|| String::with_capacity(input.len() + 12));
52
+ s.push_str(&input[scan_start..i]);
53
+ s.push_str("&lt;");
54
+ scan_start = i + 1;
55
+ }
56
+ i += 1;
57
+ }
58
+
59
+ out.map(|mut s| {
60
+ s.push_str(&input[scan_start..]);
61
+ s
62
+ })
63
+ }
64
+
65
+ /// True when `bytes[pos..]` starts with `<` or `</`, followed by one
66
+ /// of the disallowed tag names, with the next char being a proper
67
+ /// tag-boundary (space, tab, CR, LF, `>`, or `/>`).
68
+ fn is_disallowed_at(bytes: &[u8], pos: usize) -> bool {
69
+ debug_assert_eq!(bytes[pos], b'<');
70
+ let mut i = pos + 1;
71
+ if i >= bytes.len() {
72
+ return false;
73
+ }
74
+ if bytes[i] == b'/' {
75
+ i += 1;
76
+ if i >= bytes.len() {
77
+ return false;
78
+ }
79
+ }
80
+
81
+ for &name in DISALLOWED {
82
+ let end = i + name.len();
83
+ if end > bytes.len() {
84
+ continue;
85
+ }
86
+ if !bytes[i..end].eq_ignore_ascii_case(name) {
87
+ continue;
88
+ }
89
+ // Require a proper tag-boundary so `<scripter>` doesn't match.
90
+ if end == bytes.len() {
91
+ // Ambiguous cut-off: match comrak's conservative default
92
+ // (no escape).
93
+ return false;
94
+ }
95
+ let next = bytes[end];
96
+ if is_space(next) || next == b'>' {
97
+ return true;
98
+ }
99
+ if next == b'/' {
100
+ // Match only when `/>` (spec's self-closing form).
101
+ return end + 1 < bytes.len() && bytes[end + 1] == b'>';
102
+ }
103
+ return false;
104
+ }
105
+ false
106
+ }
107
+
108
/// ASCII whitespace as classified by cmark's `isspace`: space, tab, LF,
/// vertical tab (0x0B), form feed (0x0C) and CR.
///
/// Vertical tab and form feed are included to match the reference
/// tagfilter, which uses the same classification. Form feed matters in
/// particular: HTML treats it as a tag-name terminator, so a tag such as
/// `<script\x0Csrc=...>` must be recognised as a `script` tag and escaped.
#[inline]
fn is_space(c: u8) -> bool {
    matches!(c, b' ' | b'\t' | b'\n' | 0x0b | 0x0c | b'\r')
}
114
+
115
#[cfg(test)]
mod tests {
    use super::*;

    /// What a caller ends up with: the rewritten text, or the input
    /// itself when `rewrite` reports there was nothing to change.
    fn rw(s: &str) -> String {
        match rewrite(s) {
            Some(escaped) => escaped,
            None => s.to_owned(),
        }
    }

    /// Checks every `(input, expected)` pair against the filter output.
    #[track_caller]
    fn assert_all(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(rw(input), expected, "input: {input:?}");
        }
    }

    #[test]
    fn escapes_open_tag() {
        assert_all(&[("<script>", "&lt;script>")]);
    }

    #[test]
    fn escapes_close_tag() {
        assert_all(&[("</script>", "&lt;/script>")]);
    }

    #[test]
    fn escapes_both_in_one_pass() {
        assert_all(&[(
            "hi <script>alert(1)</script> bye",
            "hi &lt;script>alert(1)&lt;/script> bye",
        )]);
    }

    #[test]
    fn case_insensitive() {
        assert_all(&[
            ("<SCRIPT>", "&lt;SCRIPT>"),
            ("<ScRiPt>", "&lt;ScRiPt>"),
            ("</IFRAME>", "&lt;/IFRAME>"),
        ]);
    }

    #[test]
    fn does_not_match_prefix() {
        // Longer names that merely start with a disallowed one are left alone.
        assert_all(&[
            ("<scripter>", "<scripter>"),
            ("<styles>", "<styles>"),
            ("<titleish>", "<titleish>"),
        ]);
    }

    #[test]
    fn escapes_with_attributes() {
        assert_all(&[
            (r#"<script src="evil.js">"#, r#"&lt;script src="evil.js">"#),
            ("<iframe\tsrc=\"x\">", "&lt;iframe\tsrc=\"x\">"),
        ]);
    }

    #[test]
    fn escapes_self_closing() {
        assert_all(&[("<script/>", "&lt;script/>")]);
    }

    #[test]
    fn non_self_closing_slash_not_escaped() {
        // `<script/ok>` is weird; comrak's rule requires `/>` exactly.
        assert_all(&[("<script/ok>", "<script/ok>")]);
    }

    #[test]
    fn all_nine_tags_escaped() {
        let names = [
            "title",
            "textarea",
            "style",
            "xmp",
            "iframe",
            "noembed",
            "noframes",
            "script",
            "plaintext",
        ];
        for name in names {
            let input = format!("<{name}>");
            let expected = format!("&lt;{name}>");
            assert_eq!(rw(&input), expected, "tag: {name}");
        }
    }

    #[test]
    fn no_alloc_when_clean() {
        for clean in ["<b>hi</b>", "plain text", ""] {
            assert!(rewrite(clean).is_none());
        }
    }

    #[test]
    fn handles_cut_off_at_end() {
        // No trailing boundary char: ambiguous, so don't escape.
        assert_all(&[("<script", "<script"), ("</script", "</script")]);
    }

    #[test]
    fn standalone_lt_passes_through() {
        assert_all(&[("< script>", "< script>"), ("a < b", "a < b")]);
    }

    #[test]
    fn already_escaped_not_double_escaped() {
        assert_all(&[("&lt;script>", "&lt;script>")]);
    }

    #[test]
    fn matches_comrak_reference_case() {
        // From comrak/src/tests/tagfilter.rs: "hi <xmp> ok\n\n<xmp>\n"
        assert_all(&[("hi <xmp> ok\n\n<xmp>\n", "hi &lt;xmp> ok\n\n&lt;xmp>\n")]);
    }
}