RubyGems - inkmark - Versions diffs - 0.1.0 - Mend

inkmark 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +3 -0
data/Cargo.lock +940 -0
data/Cargo.toml +27 -0
data/LICENSE.txt +21 -0
data/NOTICE +16 -0
data/README.md +1166 -0
data/ext/inkmark/Cargo.toml +31 -0
data/ext/inkmark/build.rs +5 -0
data/ext/inkmark/extconf.rb +6 -0
data/ext/inkmark/src/autolink.rs +167 -0
data/ext/inkmark/src/chunks_by_heading.rs +325 -0
data/ext/inkmark/src/chunks_by_size.rs +302 -0
data/ext/inkmark/src/document.rs +411 -0
data/ext/inkmark/src/emoji.rs +197 -0
data/ext/inkmark/src/handler.rs +758 -0
data/ext/inkmark/src/heading.rs +262 -0
data/ext/inkmark/src/highlight.rs +202 -0
data/ext/inkmark/src/image.rs +284 -0
data/ext/inkmark/src/lib.rs +54 -0
data/ext/inkmark/src/link.rs +291 -0
data/ext/inkmark/src/options.rs +231 -0
data/ext/inkmark/src/plain_text.rs +445 -0
data/ext/inkmark/src/scheme_filter.rs +319 -0
data/ext/inkmark/src/stats.rs +453 -0
data/ext/inkmark/src/tag_filter.rs +226 -0
data/ext/inkmark/src/toc.rs +221 -0
data/ext/inkmark/src/truncate.rs +267 -0
data/ext/inkmark/src/url_match.rs +178 -0
data/lib/inkmark/event.rb +342 -0
data/lib/inkmark/native.rb +8 -0
data/lib/inkmark/options.rb +698 -0
data/lib/inkmark/toc.rb +40 -0
data/lib/inkmark/version.rb +6 -0
data/lib/inkmark.rb +711 -0
data/sig/inkmark.rbs +219 -0
metadata +208 -0

data/ext/inkmark/src/stats.rs ADDED Viewed

@@ -0,0 +1,453 @@
+//! Document statistics and table-of-contents collector.
+//!
+//! Walks a slice of `(Event, byte_range)` tuples once (before filters) and collects:
+//! - Text buffer → character count, word count, language detection
+//! - Heading entries → heading count, TOC (markdown + HTML), heading extract
+//! - Code block count + raw source extract
+//! - Image and link extract metadata
+//! - Footnote definition count + body extract
+//!
+//! Byte ranges come from pulldown-cmark's `OffsetIter`: the Start tag's
+//! range spans the whole source element (e.g. the entire
+//! `` ```ruby\n...\n``` ``
+//! for a fenced code block). The caller (`document.rs`) parses with
+//! `Parser::new_ext(...).into_offset_iter()` and hands us the result.
+//!
+//! The collector is the single source of truth for the full-render path.
+//! Two independent Ruby-side knobs consume its output:
+//! - `statistics: true` => scalar counts and language
+//! detection (`to_statistics_hash`)
+//! - `extract: {...}` => filtered arrays of structured
+//! records (`to_extracts_hash`)
+use std::ops::Range;
+use magnus::{Error, RArray, RHash, Ruby};
+use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Tag, TagEnd};
+use unicode_segmentation::UnicodeSegmentation;
+use crate::heading::{self, SlugDeduplicator};
+use crate::toc::{self, TocEntry};
+/// An image captured during the stats walk, later serialized under the
+/// `:images` extract key.
+pub struct ImageInfo {
+    /// Destination URL copied from the image tag's `dest_url`.
+    pub src: String,
+    /// Alt text: the text content seen between the image's start and end
+    /// events, with inline formatting flattened away.
+    pub alt: String,
+    /// Title string copied from the image tag.
+    pub title: String,
+    /// Source byte span that arrived paired with the image's start event.
+    pub byte_range: Range<usize>,
+}
+/// A link captured during the stats walk, later serialized under the
+/// `:links` extract key.
+pub struct LinkInfo {
+    /// Destination URL copied from the link tag's `dest_url`.
+    pub href: String,
+    /// Link text: the text content seen between the link's start and end
+    /// events, with inline formatting flattened away.
+    pub text: String,
+    /// Title string copied from the link tag.
+    pub title: String,
+    /// Source byte span that arrived paired with the link's start event.
+    pub byte_range: Range<usize>,
+}
+/// A single fenced or indented code block captured before any filter
+/// runs: the `source` is pulldown-cmark's unmodified content, suitable
+/// for passing to an external highlighter.
+/// `lang` is the info string on a fence (e.g. `"ruby"`); indented code
+/// blocks carry the empty string, matching the handler API.
+pub struct CodeBlockInfo {
+    /// Fence info string, or `""` for an indented block.
+    pub lang: String,
+    /// Concatenation of every text event emitted inside the block.
+    pub source: String,
+    /// Source byte span that arrived paired with the block's start event.
+    pub byte_range: Range<usize>,
+}
+/// A footnote definition `[^label]: body`. `text` is the plain-text body:
+/// emphasis, links, and inline formatting are flattened to their text
+/// content, matching how `ImageInfo.alt` and `LinkInfo.text` are captured.
+pub struct FootnoteDefInfo {
+    /// The label carried by the footnote-definition tag.
+    pub label: String,
+    /// Body text. The collector appends a single space after every text
+    /// run and drops one trailing space when the definition closes.
+    pub text: String,
+    /// Source byte span that arrived paired with the definition's start
+    /// event.
+    pub byte_range: Range<usize>,
+}
+/// Heading record for the `extract[:headings]` projection. Parallel to
+/// `toc::TocEntry` but adds a byte range and uses the `id`/extract
+/// vocabulary. We push to both during the walk: it's one allocation per
+/// heading, and keeps the TOC data type free of byte-range baggage that
+/// its renderer ignores.
+pub struct HeadingInfo {
+    /// Heading level taken from the heading's start tag.
+    pub level: HeadingLevel,
+    /// Text content seen between the heading's start and end events.
+    pub text: String,
+    /// Deduplicated slug for this heading; the same value is stored as
+    /// `slug` on the matching `TocEntry`.
+    pub id: String,
+    /// Source byte span that arrived paired with the heading's start event.
+    pub byte_range: Range<usize>,
+}
+/// Everything `collect` gathers in its single pass over the event stream.
+/// `to_statistics_hash` and `to_extracts_hash` read from this value.
+pub struct Stats {
+    /// All text outside the metadata block, with a space appended after
+    /// every text run and for every soft/hard break outside code blocks.
+    /// Feeds the character count, word count, and language detection.
+    pub text_buf: String,
+    /// Number of TOC entries. Headings whose slug comes out empty produce
+    /// no entry and are therefore not counted.
+    pub heading_count: usize,
+    /// Code blocks with their raw source.
+    pub code_blocks: Vec<CodeBlockInfo>,
+    /// Images with destination, alt text, and title.
+    pub images: Vec<ImageInfo>,
+    /// Links with destination, text, and title.
+    pub links: Vec<LinkInfo>,
+    /// Footnote definitions with their flattened body text.
+    pub footnote_definitions: Vec<FootnoteDefInfo>,
+    /// Heading records for the extract projection (with byte ranges).
+    pub headings: Vec<HeadingInfo>,
+    /// Heading records for the TOC (no byte ranges).
+    pub toc_entries: Vec<TocEntry>,
+    /// Text captured from inside a metadata block, if the document has
+    /// one; `None` otherwise.
+    pub frontmatter: Option<String>,
+}
+/// Selects the extract arrays that end up in the Ruby-side `:extracts`
+/// hash. Each field corresponds 1:1 to a key of the Ruby-facing
+/// `extract: { ... }` option; the default value has every flag off.
+#[derive(Default, Clone, Copy)]
+pub struct ExtractFlags {
+    pub images: bool,
+    pub links: bool,
+    pub code_blocks: bool,
+    pub headings: bool,
+    pub footnote_definitions: bool,
+}
+impl ExtractFlags {
+    /// Returns `true` when at least one extract kind is requested.
+    pub fn any(&self) -> bool {
+        let requested = [
+            self.images,
+            self.links,
+            self.code_blocks,
+            self.headings,
+            self.footnote_definitions,
+        ];
+        requested.iter().any(|&on| on)
+    }
+}
+/// Walk events and collect all statistics + TOC entries in one pass.
+/// Call BEFORE filters so we measure original content. Each event
+/// arrives paired with the byte range of its source span: the Start
+/// tag's range is what gets attached to the corresponding extract
+/// record.
+///
+/// Notes on what is captured:
+/// - Text and inline code outside a metadata block go into `text_buf`,
+///   each run followed by a space; soft/hard breaks add a space too.
+/// - Text inside a metadata block is stored as `frontmatter` and kept
+///   out of `text_buf`.
+/// - Images may nest (`![a ![b](x)](y)`): every open image receives the
+///   text, and each image yields its own record, in start-tag order.
+/// - A soft or hard break inside link text or image alt contributes a
+///   single space, so words on either side of a line break stay apart.
+/// - Headings whose slug is empty produce neither a TOC entry nor a
+///   heading record.
+pub fn collect(events: &[(Event<'_>, Range<usize>)]) -> Stats {
+    let mut text_buf = String::new();
+    let mut frontmatter: Option<String> = None;
+    let mut in_metadata_block = false;
+    // Finished (or, for images, in-progress) extract records.
+    let mut code_blocks: Vec<CodeBlockInfo> = Vec::new();
+    let mut images: Vec<ImageInfo> = Vec::new();
+    let mut links: Vec<LinkInfo> = Vec::new();
+    let mut footnote_definitions: Vec<FootnoteDefInfo> = Vec::new();
+    let mut toc_entries: Vec<TocEntry> = Vec::new();
+    let mut headings: Vec<HeadingInfo> = Vec::new();
+    // Elements that are currently open. `Some` doubles as the "inside"
+    // flag, so no separate booleans are needed.
+    let mut current_code_block: Option<CodeBlockInfo> = None;
+    let mut current_link: Option<LinkInfo> = None;
+    let mut current_footnote_def: Option<FootnoteDefInfo> = None;
+    // Indices into `images` of the images still open, innermost last.
+    // Records are pushed at the start tag so the output stays in
+    // document order even when images nest.
+    let mut open_images: Vec<usize> = Vec::new();
+    // Heading state.
+    let mut dedup = SlugDeduplicator::new();
+    let mut in_heading = false;
+    let mut current_heading_level = HeadingLevel::H1;
+    let mut current_heading_text = String::new();
+    let mut current_heading_range: Range<usize> = 0..0;
+    for (event, range) in events {
+        match event {
+            // Headings
+            Event::Start(Tag::Heading { level, .. }) => {
+                in_heading = true;
+                current_heading_level = *level;
+                current_heading_text.clear();
+                current_heading_range = range.clone();
+            }
+            Event::End(TagEnd::Heading(_)) if in_heading => {
+                in_heading = false;
+                let base = heading::slugify(&current_heading_text);
+                if !base.is_empty() {
+                    let slug = dedup.deduplicate(base);
+                    toc_entries.push(TocEntry {
+                        level: current_heading_level,
+                        text: current_heading_text.clone(),
+                        slug: slug.clone(),
+                    });
+                    // The scratch buffer is cleared at the next heading
+                    // start anyway, so hand it over instead of cloning.
+                    headings.push(HeadingInfo {
+                        level: current_heading_level,
+                        text: std::mem::take(&mut current_heading_text),
+                        id: slug,
+                        byte_range: current_heading_range.clone(),
+                    });
+                }
+            }
+            // Frontmatter
+            Event::Start(Tag::MetadataBlock(_)) => {
+                in_metadata_block = true;
+            }
+            Event::End(TagEnd::MetadataBlock(_)) => {
+                in_metadata_block = false;
+            }
+            // Code blocks
+            Event::Start(Tag::CodeBlock(kind)) => {
+                let lang = match kind {
+                    CodeBlockKind::Fenced(lang) => lang.to_string(),
+                    CodeBlockKind::Indented => String::new(),
+                };
+                current_code_block = Some(CodeBlockInfo {
+                    lang,
+                    source: String::new(),
+                    byte_range: range.clone(),
+                });
+            }
+            Event::End(TagEnd::CodeBlock) => {
+                if let Some(block) = current_code_block.take() {
+                    code_blocks.push(block);
+                }
+            }
+            // Images
+            Event::Start(Tag::Image {
+                dest_url, title, ..
+            }) => {
+                open_images.push(images.len());
+                images.push(ImageInfo {
+                    src: dest_url.to_string(),
+                    alt: String::new(),
+                    title: title.to_string(),
+                    byte_range: range.clone(),
+                });
+            }
+            Event::End(TagEnd::Image) => {
+                open_images.pop();
+            }
+            // Links
+            Event::Start(Tag::Link {
+                dest_url, title, ..
+            }) => {
+                current_link = Some(LinkInfo {
+                    href: dest_url.to_string(),
+                    text: String::new(),
+                    title: title.to_string(),
+                    byte_range: range.clone(),
+                });
+            }
+            Event::End(TagEnd::Link) => {
+                if let Some(lnk) = current_link.take() {
+                    links.push(lnk);
+                }
+            }
+            // Footnote definitions
+            Event::Start(Tag::FootnoteDefinition(label)) => {
+                current_footnote_def = Some(FootnoteDefInfo {
+                    label: label.to_string(),
+                    text: String::new(),
+                    byte_range: range.clone(),
+                });
+            }
+            Event::End(TagEnd::FootnoteDefinition) => {
+                if let Some(mut def) = current_footnote_def.take() {
+                    // Trim a single trailing space left by our " " separator
+                    // after the last text run—makes the captured body
+                    // easier to display or diff.
+                    if def.text.ends_with(' ') {
+                        def.text.pop();
+                    }
+                    footnote_definitions.push(def);
+                }
+            }
+            // ── Text ──
+            Event::Text(t) | Event::Code(t) => {
+                if in_metadata_block {
+                    // Capture the raw YAML frontmatter text;
+                    // frontmatter is structured config, not content.
+                    frontmatter = Some(t.to_string());
+                } else {
+                    // Text inside a code block also contributes to the
+                    // document's character/word totals: code is content,
+                    // especially for AI/RAG use cases where we want
+                    // `word_count` to reflect what an embedding model
+                    // would actually see.
+                    text_buf.push_str(t);
+                    text_buf.push(' ');
+                    if let Some(block) = current_code_block.as_mut() {
+                        block.source.push_str(t);
+                    }
+                }
+                if in_heading {
+                    current_heading_text.push_str(t);
+                }
+                // Every open image sees the text, so an outer image's alt
+                // includes the alt of any image nested inside it.
+                for &idx in &open_images {
+                    images[idx].alt.push_str(t);
+                }
+                if let Some(lnk) = current_link.as_mut() {
+                    lnk.text.push_str(t);
+                }
+                if let Some(def) = current_footnote_def.as_mut() {
+                    def.text.push_str(t);
+                    def.text.push(' ');
+                }
+            }
+            Event::SoftBreak | Event::HardBreak => {
+                if current_code_block.is_none() {
+                    text_buf.push(' ');
+                }
+                // A line break inside alt or link text separates words;
+                // record it as a space so "foo\nbar" does not become "foobar".
+                for &idx in &open_images {
+                    images[idx].alt.push(' ');
+                }
+                if let Some(lnk) = current_link.as_mut() {
+                    lnk.text.push(' ');
+                }
+                // NOTE(review): heading text deliberately ignores breaks
+                // here; the slug presumably has to match the id computed
+                // elsewhere for the rendered heading — confirm before
+                // changing.
+            }
+            _ => {}
+        }
+    }
+    Stats {
+        text_buf,
+        heading_count: toc_entries.len(),
+        code_blocks,
+        images,
+        links,
+        footnote_definitions,
+        headings,
+        toc_entries,
+        frontmatter,
+    }
+}
+/// Build the `:statistics` hash, which holds scalar values only.
+///
+/// `heading_count` is always present. With `full` set (the
+/// `statistics: true` option) the hash additionally carries the detected
+/// language and its confidence (both `nil` when detection yields
+/// nothing), the character and word counts of the collected text, and
+/// one `*_count` entry per extract kind. With `full` unset (toc-only
+/// mode) callers that only need `heading_count` still get it.
+///
+/// Returns an error if any insertion into the Ruby hash fails.
+pub fn to_statistics_hash(ruby: &Ruby, stats: &Stats, full: bool) -> Result<RHash, Error> {
+    let out = ruby.hash_new();
+    out.aset(ruby.to_symbol("heading_count"), stats.heading_count)?;
+    if !full {
+        return Ok(out);
+    }
+    let text = stats.text_buf.as_str();
+    if let Some(detected) = whatlang::detect(text) {
+        out.aset(ruby.to_symbol("likely_language"), detected.lang().code())?;
+        out.aset(ruby.to_symbol("language_confidence"), detected.confidence())?;
+    } else {
+        // `()` converts to Ruby `nil`.
+        out.aset(ruby.to_symbol("likely_language"), ())?;
+        out.aset(ruby.to_symbol("language_confidence"), ())?;
+    }
+    // Characters are counted after trimming the outer whitespace, which
+    // includes the separator space appended after the final text run.
+    let character_count = text.trim().chars().count();
+    let word_count = text.unicode_words().count();
+    out.aset(ruby.to_symbol("character_count"), character_count)?;
+    out.aset(ruby.to_symbol("word_count"), word_count)?;
+    out.aset(ruby.to_symbol("code_block_count"), stats.code_blocks.len())?;
+    out.aset(ruby.to_symbol("image_count"), stats.images.len())?;
+    out.aset(ruby.to_symbol("link_count"), stats.links.len())?;
+    out.aset(
+        ruby.to_symbol("footnote_definition_count"),
+        stats.footnote_definitions.len(),
+    )?;
+    Ok(out)
+}
+/// Build the `:extracts` hash from the collected records.
+///
+/// A key is present only when its flag in `flags` is set, so callers
+/// who opted into one kind pay no allocation cost for the others. Each
+/// value is an array of hashes, one per record, and every record
+/// carries a `:byte_range` built from the record's start and end
+/// offsets.
+///
+/// Returns an error if building any Ruby value fails.
+pub fn to_extracts_hash(ruby: &Ruby, stats: &Stats, flags: ExtractFlags) -> Result<RHash, Error> {
+    // Shared conversion of a source span into a Ruby range.
+    let span = |r: &Range<usize>| ruby.range_new(r.start as i64, r.end as i64, true);
+    let out = ruby.hash_new();
+    if flags.images {
+        let records = ruby.ary_new_capa(stats.images.len());
+        for image in stats.images.iter() {
+            let rec = ruby.hash_new();
+            rec.aset(ruby.to_symbol("src"), image.src.as_str())?;
+            rec.aset(ruby.to_symbol("alt"), image.alt.as_str())?;
+            rec.aset(ruby.to_symbol("title"), image.title.as_str())?;
+            rec.aset(ruby.to_symbol("byte_range"), span(&image.byte_range)?)?;
+            records.push(rec)?;
+        }
+        out.aset(ruby.to_symbol("images"), records)?;
+    }
+    if flags.links {
+        let records = ruby.ary_new_capa(stats.links.len());
+        for link in stats.links.iter() {
+            let rec = ruby.hash_new();
+            rec.aset(ruby.to_symbol("href"), link.href.as_str())?;
+            rec.aset(ruby.to_symbol("text"), link.text.as_str())?;
+            rec.aset(ruby.to_symbol("title"), link.title.as_str())?;
+            rec.aset(ruby.to_symbol("byte_range"), span(&link.byte_range)?)?;
+            records.push(rec)?;
+        }
+        out.aset(ruby.to_symbol("links"), records)?;
+    }
+    if flags.code_blocks {
+        let records = ruby.ary_new_capa(stats.code_blocks.len());
+        for code in stats.code_blocks.iter() {
+            let rec = ruby.hash_new();
+            rec.aset(ruby.to_symbol("lang"), code.lang.as_str())?;
+            rec.aset(ruby.to_symbol("source"), code.source.as_str())?;
+            rec.aset(ruby.to_symbol("byte_range"), span(&code.byte_range)?)?;
+            records.push(rec)?;
+        }
+        out.aset(ruby.to_symbol("code_blocks"), records)?;
+    }
+    if flags.headings {
+        let records: RArray = ruby.ary_new_capa(stats.headings.len());
+        for heading in stats.headings.iter() {
+            let rec = ruby.hash_new();
+            rec.aset(ruby.to_symbol("level"), toc::level_to_u8(heading.level))?;
+            rec.aset(ruby.to_symbol("text"), heading.text.as_str())?;
+            rec.aset(ruby.to_symbol("id"), heading.id.as_str())?;
+            rec.aset(ruby.to_symbol("byte_range"), span(&heading.byte_range)?)?;
+            records.push(rec)?;
+        }
+        out.aset(ruby.to_symbol("headings"), records)?;
+    }
+    if flags.footnote_definitions {
+        let records = ruby.ary_new_capa(stats.footnote_definitions.len());
+        for note in stats.footnote_definitions.iter() {
+            let rec = ruby.hash_new();
+            rec.aset(ruby.to_symbol("label"), note.label.as_str())?;
+            rec.aset(ruby.to_symbol("text"), note.text.as_str())?;
+            rec.aset(ruby.to_symbol("byte_range"), span(&note.byte_range)?)?;
+            records.push(rec)?;
+        }
+        out.aset(ruby.to_symbol("footnote_definitions"), records)?;
+    }
+    Ok(out)
+}

data/ext/inkmark/src/tag_filter.rs ADDED Viewed

@@ -0,0 +1,226 @@
+//! GFM "Disallowed Raw HTML" extension (spec §6.11).
+//!
+//! Escapes the leading `<` of nine spec-designated tag names so raw
+//! HTML that would change how the document is parsed (or run script)
+//! renders as text instead. Mirrors [comrak](https://github.com/kivikakk/comrak/blob/main/src/html.rs): the
+//! transformation is defined textually by the GFM spec, so we do a
+//! byte scan rather than parse HTML.
+use pulldown_cmark::{CowStr, Event};
+/// The nine tag names whose leading `<` gets escaped. Stored in
+/// lowercase; the scanner compares them ASCII case-insensitively.
+const DISALLOWED: &[&[u8]] = &[
+    b"title",
+    b"textarea",
+    b"style",
+    b"xmp",
+    b"iframe",
+    b"noembed",
+    b"noframes",
+    b"script",
+    b"plaintext",
+];
+/// Run the tagfilter over one event.
+///
+/// Only `Html` and `InlineHtml` events are inspected. When their text
+/// contains a disallowed tag the event is rebuilt around the escaped
+/// string; otherwise the original event (and its original string) is
+/// handed back untouched. Every other event kind passes straight through.
+#[inline]
+pub fn apply_event(event: Event<'_>) -> Event<'_> {
+    match event {
+        Event::Html(raw) => {
+            Event::Html(rewrite(&raw).map_or(raw, |fixed| CowStr::Boxed(fixed.into_boxed_str())))
+        }
+        Event::InlineHtml(raw) => Event::InlineHtml(
+            rewrite(&raw).map_or(raw, |fixed| CowStr::Boxed(fixed.into_boxed_str())),
+        ),
+        untouched => untouched,
+    }
+}
+/// Escape the `<` of every disallowed tag open/close found in `input`.
+///
+/// Returns `Some(escaped)` when at least one `<` was replaced with
+/// `&lt;`, and `None` when nothing needed rewriting, so callers can keep
+/// the original string without copying it.
+fn rewrite(input: &str) -> Option<String> {
+    let raw = input.as_bytes();
+    // Allocated lazily on the first hit so clean input costs nothing.
+    let mut escaped: Option<String> = None;
+    // Start of the input that has not been copied into `escaped` yet.
+    let mut copied_upto = 0;
+    for (lt, _) in input.match_indices('<') {
+        if !is_disallowed_at(raw, lt) {
+            continue;
+        }
+        let buf = escaped.get_or_insert_with(|| String::with_capacity(input.len() + 12));
+        buf.push_str(&input[copied_upto..lt]);
+        buf.push_str("&lt;");
+        copied_upto = lt + 1;
+    }
+    let mut buf = escaped?;
+    buf.push_str(&input[copied_upto..]);
+    Some(buf)
+}
+/// Decide whether the `<` at `bytes[pos]` opens or closes a disallowed tag.
+///
+/// The `<` may be followed by one optional `/`, then one of the
+/// `DISALLOWED` names (ASCII case-insensitive), and then a tag boundary:
+/// a byte accepted by `is_space`, a `>`, or the two bytes `/>`. A name
+/// that runs to the very end of the input has no boundary and is not
+/// treated as a match.
+fn is_disallowed_at(bytes: &[u8], pos: usize) -> bool {
+    debug_assert_eq!(bytes[pos], b'<');
+    // Everything after the `<`, minus the `/` of a closing tag if present.
+    let after_lt = bytes.get(pos + 1..).unwrap_or(&[]);
+    let tail = match after_lt.split_first() {
+        Some((&b'/', rest)) => rest,
+        _ => after_lt,
+    };
+    // The first name that prefixes `tail` decides the outcome.
+    let matched = DISALLOWED.iter().copied().find(|&name| {
+        tail.len() >= name.len() && tail[..name.len()].eq_ignore_ascii_case(name)
+    });
+    let boundary = match matched {
+        Some(name) => &tail[name.len()..],
+        None => return false,
+    };
+    // Require a proper tag-boundary so `<scripter>` doesn't match.
+    match boundary {
+        // Ambiguous cut-off: match comrak's conservative default (no escape).
+        [] => false,
+        [b'>', ..] => true,
+        // A slash counts only as part of `/>` (spec's self-closing form).
+        [b'/', b'>', ..] => true,
+        [b'/', ..] => false,
+        [next, ..] => is_space(*next),
+    }
+}
+/// ASCII whitespace as classified by cmark's `cmark_isspace` (and
+/// comrak's `isspace`): tab, LF, vertical tab, form feed, CR, and space,
+/// i.e. bytes `0x09..=0x0D` plus `0x20`.
+///
+/// Vertical tab and form feed matter here: form feed ends a tag name in
+/// HTML tokenizers, so `<script\x0c>` must count as a tag boundary or it
+/// would slip past the filter unescaped.
+#[inline]
+fn is_space(c: u8) -> bool {
+    matches!(c, b'\t'..=b'\r' | b' ')
+}
+// Unit tests for the byte scanner. They exercise `rewrite` directly, so
+// no pulldown-cmark events are needed.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    // Returns the rewritten string, or the input itself when `rewrite`
+    // reports that nothing changed.
+    fn rw(s: &str) -> String {
+        rewrite(s).unwrap_or_else(|| s.to_string())
+    }
+    #[test]
+    fn escapes_open_tag() {
+        assert_eq!(rw("<script>"), "&lt;script>");
+    }
+    #[test]
+    fn escapes_close_tag() {
+        assert_eq!(rw("</script>"), "&lt;/script>");
+    }
+    #[test]
+    fn escapes_both_in_one_pass() {
+        assert_eq!(
+            rw("hi <script>alert(1)</script> bye"),
+            "hi &lt;script>alert(1)&lt;/script> bye"
+        );
+    }
+    #[test]
+    fn case_insensitive() {
+        assert_eq!(rw("<SCRIPT>"), "&lt;SCRIPT>");
+        assert_eq!(rw("<ScRiPt>"), "&lt;ScRiPt>");
+        assert_eq!(rw("</IFRAME>"), "&lt;/IFRAME>");
+    }
+    #[test]
+    fn does_not_match_prefix() {
+        // A disallowed name followed by more name characters is a
+        // different tag and must be left alone.
+        assert_eq!(rw("<scripter>"), "<scripter>");
+        assert_eq!(rw("<styles>"), "<styles>");
+        assert_eq!(rw("<titleish>"), "<titleish>");
+    }
+    #[test]
+    fn escapes_with_attributes() {
+        assert_eq!(
+            rw(r#"<script src="evil.js">"#),
+            r#"&lt;script src="evil.js">"#
+        );
+        assert_eq!(rw("<iframe\tsrc=\"x\">"), "&lt;iframe\tsrc=\"x\">");
+    }
+    #[test]
+    fn escapes_self_closing() {
+        assert_eq!(rw("<script/>"), "&lt;script/>");
+    }
+    #[test]
+    fn non_self_closing_slash_not_escaped() {
+        // `<script/ok>` is weird; comrak's rule requires `/>` exactly.
+        assert_eq!(rw("<script/ok>"), "<script/ok>");
+    }
+    #[test]
+    fn all_nine_tags_escaped() {
+        for name in [
+            "title",
+            "textarea",
+            "style",
+            "xmp",
+            "iframe",
+            "noembed",
+            "noframes",
+            "script",
+            "plaintext",
+        ] {
+            let input = format!("<{name}>");
+            let expected = format!("&lt;{name}>");
+            assert_eq!(rw(&input), expected, "tag: {name}");
+        }
+    }
+    #[test]
+    fn no_alloc_when_clean() {
+        // `None` is the signal that the caller can keep its original string.
+        assert!(rewrite("<b>hi</b>").is_none());
+        assert!(rewrite("plain text").is_none());
+        assert!(rewrite("").is_none());
+    }
+    #[test]
+    fn handles_cut_off_at_end() {
+        // No trailing boundary char—ambiguous, don't escape.
+        assert_eq!(rw("<script"), "<script");
+        assert_eq!(rw("</script"), "</script");
+    }
+    #[test]
+    fn standalone_lt_passes_through() {
+        // The name must follow the `<` (or `</`) immediately.
+        assert_eq!(rw("< script>"), "< script>");
+        assert_eq!(rw("a < b"), "a < b");
+    }
+    #[test]
+    fn already_escaped_not_double_escaped() {
+        assert_eq!(rw("&lt;script>"), "&lt;script>");
+    }
+    #[test]
+    fn matches_comrak_reference_case() {
+        // From comrak/src/tests/tagfilter.rs: "hi <xmp> ok\n\n<xmp>\n"
+        let input = "hi <xmp> ok\n\n<xmp>\n";
+        let expected = "hi &lt;xmp> ok\n\n&lt;xmp>\n";
+        assert_eq!(rw(input), expected);
+    }
+}