RubyGems - inkmark - Versions diffs - 0.1.0 - Mend

inkmark 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +3 -0
data/Cargo.lock +940 -0
data/Cargo.toml +27 -0
data/LICENSE.txt +21 -0
data/NOTICE +16 -0
data/README.md +1166 -0
data/ext/inkmark/Cargo.toml +31 -0
data/ext/inkmark/build.rs +5 -0
data/ext/inkmark/extconf.rb +6 -0
data/ext/inkmark/src/autolink.rs +167 -0
data/ext/inkmark/src/chunks_by_heading.rs +325 -0
data/ext/inkmark/src/chunks_by_size.rs +302 -0
data/ext/inkmark/src/document.rs +411 -0
data/ext/inkmark/src/emoji.rs +197 -0
data/ext/inkmark/src/handler.rs +758 -0
data/ext/inkmark/src/heading.rs +262 -0
data/ext/inkmark/src/highlight.rs +202 -0
data/ext/inkmark/src/image.rs +284 -0
data/ext/inkmark/src/lib.rs +54 -0
data/ext/inkmark/src/link.rs +291 -0
data/ext/inkmark/src/options.rs +231 -0
data/ext/inkmark/src/plain_text.rs +445 -0
data/ext/inkmark/src/scheme_filter.rs +319 -0
data/ext/inkmark/src/stats.rs +453 -0
data/ext/inkmark/src/tag_filter.rs +226 -0
data/ext/inkmark/src/toc.rs +221 -0
data/ext/inkmark/src/truncate.rs +267 -0
data/ext/inkmark/src/url_match.rs +178 -0
data/lib/inkmark/event.rb +342 -0
data/lib/inkmark/native.rb +8 -0
data/lib/inkmark/options.rb +698 -0
data/lib/inkmark/toc.rb +40 -0
data/lib/inkmark/version.rb +6 -0
data/lib/inkmark.rb +711 -0
data/sig/inkmark.rbs +219 -0
metadata +208 -0

data/ext/inkmark/Cargo.toml ADDED Viewed

@@ -0,0 +1,31 @@
+[package]
+name = "inkmark"
+version = "0.1.0"
+edition = "2021"
+authors = ["Yaroslav Markin <yaroslav@markin.net>"]
+license = "MIT"
+publish = false
+[lib]
+crate-type = ["cdylib"]
+[dependencies]
+magnus = { version = "0.8.2" }
+rb-sys = { version = "0.9.126", features = ["stable-api-compiled-fallback"] }
+pulldown-cmark = { version = "0.13", default-features = false, features = ["html", "simd"] }
+pulldown-cmark-escape = "0.11"
+deunicode = "1.6"
+emojis = "0.8"
+whatlang = "0.18"
+unicode-segmentation = "1.12"
+syntect = { version = "5.2", default-features = false, features = ["parsing", "html", "default-themes", "default-syntaxes", "regex-fancy"] }
+linkify = "0.11"
+globset = "0.4"
+url = "2"
+pulldown-cmark-to-cmark = "22"
+[build-dependencies]
+rb-sys-env = "0.2.2"
+[dev-dependencies]
+rb-sys-test-helpers = { version = "0.2.2" }

data/ext/inkmark/build.rs ADDED Viewed

@@ -0,0 +1,5 @@
+/// Build-script entry point.
+///
+/// Calls `rb_sys_env::activate()` and propagates any error it returns.
+/// The value produced on success is not needed here and is discarded.
+pub fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let _ = rb_sys_env::activate()?;
+    Ok(())
+}

data/ext/inkmark/extconf.rb ADDED Viewed

@@ -0,0 +1,6 @@
+# frozen_string_literal: true
+# Extension build configuration: loads mkmf together with the rb_sys mkmf
+# helpers, then asks rb_sys to create the Makefile for the "inkmark/inkmark"
+# target.
+require "mkmf"
+require "rb_sys/mkmf"
+create_rust_makefile("inkmark/inkmark")

data/ext/inkmark/src/autolink.rs ADDED Viewed

@@ -0,0 +1,167 @@
+//! Auto-linking filter for bare URLs and email addresses.
+//!
+//! When enabled, scans `Event::Text` payloads for bare URLs and emails
+//! using the `linkify` crate, and splits them into alternating
+//! `Event::Text` / `Event::Start(Link)` + `Event::Text` + `Event::End(Link)`
+//! sequences. Text inside code blocks and existing links is not touched.
+use linkify::{LinkFinder, LinkKind};
+use pulldown_cmark::{CowStr, Event, LinkType, Tag, TagEnd};
+/// Scan text events for bare URLs/emails and wrap them in link events.
+///
+/// Every `Event::Text` that is *not* nested inside an existing link, a
+/// code block, or a metadata (front-matter) block is scanned with
+/// `linkify`. When at least one URL or email address is found, the text
+/// event is replaced by an alternating sequence of plain `Event::Text`
+/// fragments and `Start(Link)` / `Text` / `End(Link)` triples. All other
+/// events, and text events without any match, pass through unchanged and
+/// in their original order.
+///
+/// Metadata blocks are skipped because their text is raw front matter,
+/// not prose: splitting it into link events would alter the metadata
+/// when the stream is serialized again.
+///
+/// Depth counters use saturating subtraction so a stray `End` event in a
+/// malformed stream cannot underflow.
+pub fn autolink(events: Vec<Event<'_>>) -> Vec<Event<'_>> {
+    /// Copy `s` into an owned `CowStr`. Fragments borrow from the text
+    /// event being split, which does not outlive the loop iteration, so
+    /// the emitted events must own their data.
+    fn owned<'a>(s: &str) -> CowStr<'a> {
+        CowStr::Boxed(s.to_string().into_boxed_str())
+    }
+    let finder = LinkFinder::new();
+    let mut out: Vec<Event<'_>> = Vec::with_capacity(events.len());
+    let mut link_depth: usize = 0;
+    let mut code_depth: usize = 0;
+    let mut metadata_depth: usize = 0;
+    for event in events {
+        match &event {
+            Event::Start(Tag::Link { .. }) => link_depth += 1,
+            Event::End(TagEnd::Link) => link_depth = link_depth.saturating_sub(1),
+            Event::Start(Tag::CodeBlock(_)) => code_depth += 1,
+            Event::End(TagEnd::CodeBlock) => code_depth = code_depth.saturating_sub(1),
+            Event::Start(Tag::MetadataBlock(_)) => metadata_depth += 1,
+            Event::End(TagEnd::MetadataBlock(_)) => {
+                metadata_depth = metadata_depth.saturating_sub(1);
+            }
+            _ => {}
+        }
+        // Only Text events outside links, code blocks and metadata blocks
+        // are candidates; everything else is forwarded untouched.
+        let suppressed = link_depth > 0 || code_depth > 0 || metadata_depth > 0;
+        let text = match event {
+            Event::Text(text) if !suppressed => text,
+            other => {
+                out.push(other);
+                continue;
+            }
+        };
+        let spans: Vec<_> = finder.spans(&text).collect();
+        // Fast path: no links found, so push the original event unchanged.
+        if spans.iter().all(|s| s.kind().is_none()) {
+            out.push(Event::Text(text));
+            continue;
+        }
+        // Split the text into alternating plain / link spans.
+        for span in spans {
+            let fragment = &text[span.start()..span.end()];
+            let link_type = match span.kind() {
+                Some(LinkKind::Url) => LinkType::Autolink,
+                // pulldown-cmark's HTML writer adds "mailto:" for
+                // LinkType::Email, so only the bare address is stored.
+                Some(LinkKind::Email) => LinkType::Email,
+                Some(_) | None => {
+                    // Plain text segment: no link.
+                    if !fragment.is_empty() {
+                        out.push(Event::Text(owned(fragment)));
+                    }
+                    continue;
+                }
+            };
+            out.push(Event::Start(Tag::Link {
+                link_type,
+                dest_url: owned(fragment),
+                title: CowStr::Borrowed(""),
+                id: CowStr::Borrowed(""),
+            }));
+            out.push(Event::Text(owned(fragment)));
+            out.push(Event::End(TagEnd::Link));
+        }
+    }
+    out
+}
+#[cfg(test)]
+mod tests {
+    use super::autolink;
+    use pulldown_cmark::{CowStr, Event, LinkType, Tag, TagEnd};
+    #[test]
+    fn bare_url_becomes_link() {
+        let events = vec![Event::Text(CowStr::Borrowed(
+            "Visit https://example.net today",
+        ))];
+        let out = autolink(events);
+        // Should produce: Text("Visit ") + Start(Link) + Text("https://example.net") + End(Link) + Text(" today")
+        assert!(out.len() >= 5, "expected split events, got {}", out.len());
+        let has_link = out
+            .iter()
+            .any(|e| matches!(e, Event::Start(Tag::Link { .. })));
+        assert!(has_link, "no link event found");
+    }
+    #[test]
+    fn email_becomes_email_link() {
+        let events = vec![Event::Text(CowStr::Borrowed("Contact user@example.com"))];
+        let out = autolink(events);
+        // pulldown-cmark's HTML writer adds "mailto:" for LinkType::Email,
+        // so we only store the bare address in dest_url.
+        let has_email = out.iter().any(|e| match e {
+            Event::Start(Tag::Link {
+                link_type: LinkType::Email,
+                dest_url,
+                ..
+            }) => dest_url.as_ref() == "user@example.com",
+            _ => false,
+        });
+        assert!(has_email, "no email link found in {out:?}");
+    }
+    #[test]
+    fn text_without_urls_unchanged() {
+        let events = vec![Event::Text(CowStr::Borrowed("just plain text"))];
+        let out = autolink(events);
+        assert_eq!(out.len(), 1);
+        assert!(matches!(&out[0], Event::Text(t) if t.as_ref() == "just plain text"));
+    }
+    #[test]
+    fn skips_inside_existing_links() {
+        let events = vec![
+            Event::Start(Tag::Link {
+                link_type: LinkType::Inline,
+                dest_url: CowStr::Borrowed("https://example.net"),
+                title: CowStr::Borrowed(""),
+                id: CowStr::Borrowed(""),
+            }),
+            Event::Text(CowStr::Borrowed("https://example.net")),
+            Event::End(TagEnd::Link),
+        ];
+        let out = autolink(events);
+        // Should be unchanged—3 events, no extra links added.
+        assert_eq!(out.len(), 3);
+    }
+    #[test]
+    fn skips_inside_code_blocks() {
+        let events = vec![
+            Event::Start(Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
+                CowStr::Borrowed(""),
+            ))),
+            Event::Text(CowStr::Borrowed("https://example.net")),
+            Event::End(TagEnd::CodeBlock),
+        ];
+        let out = autolink(events);
+        assert_eq!(out.len(), 3);
+    }
+}

data/ext/inkmark/src/chunks_by_heading.rs ADDED Viewed

@@ -0,0 +1,325 @@
+//! Heading-based section extraction for LLM / RAG pipelines.
+//!
+//! Splits a document into hierarchical sections by heading.
+//! Each section's `content` is filter-applied Markdown: emoji
+//! expanded, URLs autolinked, host/scheme allowlists applied,
+//! then serialized back through `pulldown-cmark-to-cmark`.
+//!
+//! Designed as a first-stage chunking primitive for
+//! `chunk → embed → retrieve` pipelines: feed a document in, get an
+//! ordered array of heading-led sections out. The Ruby side wraps this
+//! with an optional `heading:` filter (String or Regexp).
+//!
+//! Heading Start/End pairs survive the filter pipeline intact:
+//! emoji/autolink rewrites happen *inside* a heading's text events,
+//! but the bracketing tags stay in place, so scanning the filtered
+//! event stream for heading positions remains coherent.
+use magnus::{Error, RArray, RHash, Ruby};
+use pulldown_cmark::{Event, HeadingLevel, Parser, Tag, TagEnd};
+use unicode_segmentation::UnicodeSegmentation;
+use crate::document::apply_filters;
+use crate::heading::{self, SlugDeduplicator};
+use crate::options::build_options;
+use crate::toc;
+use crate::truncate::{self, TruncateParams};
+/// Split `source` into heading-led sections and return them as a Ruby
+/// array of hashes, in document order.
+///
+/// The document is parsed with the options built from `opts_hash`, run
+/// through `apply_filters`, and then cut at heading boundaries. An
+/// optional nested hash under the `:truncate` key of `opts_hash` supplies
+/// truncation parameters applied to each section's content. Character
+/// and word counts are added to each entry when `flags.statistics` is set.
+///
+/// Errors from the `:truncate` lookup, `truncate::parse_params`,
+/// `build_options`, and from building the result hashes and array are
+/// propagated to the caller.
+pub fn native_chunks_by_heading(
+    ruby: &Ruby,
+    source: String,
+    opts_hash: RHash,
+) -> Result<RArray, Error> {
+    // The Ruby side merges the optional `truncate:` kwarg into the
+    // opts hash under the `:truncate` key before calling us. We pull
+    // it out here; the rest of `build_options` ignores it.
+    let truncate_params: Option<TruncateParams> = {
+        let nested: Option<RHash> = opts_hash.lookup(ruby.to_symbol("truncate"))?;
+        match nested {
+            Some(h) => Some(truncate::parse_params(ruby, h)?),
+            None => None,
+        }
+    };
+    let (cm_opts, flags) = build_options(ruby, opts_hash)?;
+    // Parse + run the full filter pipeline, same as `to_markdown`.
+    let events: Vec<Event> = Parser::new_ext(&source, cm_opts).collect();
+    let events = apply_filters(events, &flags);
+    let boundaries = find_heading_boundaries(&events);
+    let result = ruby.ary_new();
+    // Preamble: events before the first heading, or the whole doc
+    // when there are no headings at all. Emitted as an entry with
+    // `heading: nil, level: 0, id: nil`. Skipped entirely when there
+    // is no non-empty content before the first heading.
+    let with_counts = flags.statistics;
+    let preamble_end = boundaries.first().map(|b| b.start).unwrap_or(events.len());
+    if preamble_end > 0 {
+        let preamble_events = &events[0..preamble_end];
+        if !is_empty_content(preamble_events) {
+            result.push(build_preamble_hash(
+                ruby,
+                preamble_events,
+                with_counts,
+                truncate_params.as_ref(),
+            )?)?;
+        }
+    }
+    // One entry per heading. Section end is the position of the
+    // next heading with level <= current level (or end of events).
+    //
+    // `ancestors` tracks the heading stack so we can attach a
+    // breadcrumb (root → immediate-parent) to each section. At
+    // each boundary we pop any ancestors whose level is >= the
+    // current boundary's level (those aren't parents), then record
+    // the remaining stack as this section's breadcrumb, then push
+    // the current heading for its own subsections' use.
+    let mut dedup = SlugDeduplicator::new();
+    let mut ancestors: Vec<(u8, String)> = Vec::new();
+    for (i, boundary) in boundaries.iter().enumerate() {
+        let section_end = find_section_end(&boundaries, i, events.len());
+        let level = toc::level_to_u8(boundary.level);
+        while ancestors.last().is_some_and(|(l, _)| *l >= level) {
+            ancestors.pop();
+        }
+        let breadcrumb: Vec<&str> = ancestors.iter().map(|(_, t)| t.as_str()).collect();
+        let heading_text = collect_inline_text(&events[(boundary.start + 1)..boundary.end]);
+        let hash = build_section_hash(
+            ruby,
+            &events,
+            boundary,
+            section_end,
+            &heading_text,
+            &breadcrumb,
+            &mut dedup,
+            with_counts,
+            truncate_params.as_ref(),
+        )?;
+        result.push(hash)?;
+        ancestors.push((level, heading_text));
+    }
+    Ok(result)
+}
+/// A discovered `Start(Heading) / End(Heading)` pair in the filtered
+/// event stream.
+struct HeadingBoundary {
+    // Index of the `Start(Heading)` event.
+    start: usize,
+    // Index of the matching `End(Heading)` event; equals the length of
+    // the event slice when the scan ran out of events before finding it.
+    end: usize,
+    // Level carried by the opening heading tag.
+    level: HeadingLevel,
+}
+/// Locate every top-level heading in `events` and return the index of
+/// its opening and closing tag together with its level, in stream order.
+///
+/// If the stream ends before a heading is closed, that heading's `end`
+/// is `events.len()`.
+fn find_heading_boundaries(events: &[Event<'_>]) -> Vec<HeadingBoundary> {
+    let mut found = Vec::new();
+    let mut cursor = 0;
+    while let Some(event) = events.get(cursor) {
+        let Event::Start(Tag::Heading { level, .. }) = event else {
+            cursor += 1;
+            continue;
+        };
+        // CommonMark does not nest headings, but a nesting counter keeps
+        // the pairing correct should the parser ever emit nested ones.
+        let mut open = 1usize;
+        let mut close = cursor + 1;
+        while let Some(inner) = events.get(close) {
+            match inner {
+                Event::Start(Tag::Heading { .. }) => open += 1,
+                Event::End(TagEnd::Heading(_)) => {
+                    open -= 1;
+                    if open == 0 {
+                        break;
+                    }
+                }
+                _ => {}
+            }
+            close += 1;
+        }
+        found.push(HeadingBoundary {
+            start: cursor,
+            end: close,
+            level: *level,
+        });
+        // Resume scanning just past the closing tag.
+        cursor = close + 1;
+    }
+    found
+}
+/// Return the event index at which section `i` stops.
+///
+/// That is the start of the first later heading whose level is the same
+/// or shallower (numerically <=) than section `i`'s own level. Deeper
+/// headings are subsections and stay inside the section. When no such
+/// heading exists the section runs to `events_len`.
+fn find_section_end(boundaries: &[HeadingBoundary], i: usize, events_len: usize) -> usize {
+    let own_level = toc::level_to_u8(boundaries[i].level);
+    for later in &boundaries[(i + 1)..] {
+        if toc::level_to_u8(later.level) <= own_level {
+            return later.start;
+        }
+    }
+    events_len
+}
+/// Assemble the Ruby hash for the preamble, i.e. the events that come
+/// before the first heading.
+///
+/// The entry has the same keys as a regular section so callers can
+/// treat all entries uniformly: `:heading` and `:id` are `nil`, `:level`
+/// is 0 and `:breadcrumb` is an empty array. `:character_count` and
+/// `:word_count` are added only when `with_counts` is set. `:content` is
+/// the truncated rendering when `truncate_params` is given, otherwise
+/// the plain Markdown rendering of `events`.
+fn build_preamble_hash(
+    ruby: &Ruby,
+    events: &[Event<'_>],
+    with_counts: bool,
+    truncate_params: Option<&TruncateParams>,
+) -> Result<RHash, Error> {
+    let entry = ruby.hash_new();
+    entry.aset(ruby.to_symbol("heading"), ())?;
+    entry.aset(ruby.to_symbol("level"), 0u8)?;
+    entry.aset(ruby.to_symbol("id"), ())?;
+    // No ancestors exist for the preamble, hence the empty breadcrumb.
+    entry.aset(ruby.to_symbol("breadcrumb"), ruby.ary_new_capa(0))?;
+    let content = if let Some(params) = truncate_params {
+        truncate::truncate_events(events, params)
+    } else {
+        render_markdown(events)
+    };
+    if with_counts {
+        let (character_count, word_count) =
+            count_post_truncate(events, truncate_params, &content);
+        entry.aset(ruby.to_symbol("character_count"), character_count)?;
+        entry.aset(ruby.to_symbol("word_count"), word_count)?;
+    }
+    entry.aset(ruby.to_symbol("content"), content)?;
+    Ok(entry)
+}
+/// Build the Ruby hash describing one heading-led section.
+///
+/// Keys are set in this order: `:heading`, `:level`, `:id` (`nil` when
+/// the slug is empty), `:breadcrumb`, then `:character_count` and
+/// `:word_count` when `with_counts` is set, and finally `:content`.
+///
+/// `boundary` identifies the heading's opening/closing events inside
+/// `events`, and `section_end` is the exclusive index where the
+/// section's content stops. `dedup` is shared across sections so that
+/// repeated heading texts receive distinct ids.
+///
+/// If there are no events between the heading's end and `section_end`,
+/// including the case where the heading was never closed and
+/// `boundary.end` equals `events.len()`, the content is treated as
+/// empty rather than panicking on an out-of-range slice.
+fn build_section_hash(
+    ruby: &Ruby,
+    events: &[Event<'_>],
+    boundary: &HeadingBoundary,
+    section_end: usize,
+    heading_text: &str,
+    breadcrumb: &[&str],
+    dedup: &mut SlugDeduplicator,
+    with_counts: bool,
+    truncate_params: Option<&TruncateParams>,
+) -> Result<RHash, Error> {
+    // Slug is the deduplicated slugify of the (filter-applied) heading
+    // text, matching the ids `heading_ids` / `toc` would emit for the
+    // same document.
+    let base = heading::slugify(heading_text);
+    let id = if base.is_empty() {
+        String::new()
+    } else {
+        dedup.deduplicate(base)
+    };
+    // Content = events after End(Heading) up to the next section or
+    // end of document. A checked range lookup keeps an unterminated
+    // heading (end index == events.len()) from panicking.
+    let content_events: &[Event<'_>] = events
+        .get((boundary.end + 1)..section_end)
+        .unwrap_or(&[]);
+    let hash = ruby.hash_new();
+    hash.aset(ruby.to_symbol("heading"), heading_text)?;
+    hash.aset(ruby.to_symbol("level"), toc::level_to_u8(boundary.level))?;
+    if id.is_empty() {
+        hash.aset(ruby.to_symbol("id"), ())?;
+    } else {
+        hash.aset(ruby.to_symbol("id"), id)?;
+    }
+    let breadcrumb_arr = ruby.ary_new_capa(breadcrumb.len());
+    for text in breadcrumb {
+        breadcrumb_arr.push(*text)?;
+    }
+    hash.aset(ruby.to_symbol("breadcrumb"), breadcrumb_arr)?;
+    // Re-serialize the section body, truncating when requested.
+    let content = match truncate_params {
+        Some(params) => truncate::truncate_events(content_events, params),
+        None => render_markdown(content_events),
+    };
+    if with_counts {
+        let (chars, words) = count_post_truncate(content_events, truncate_params, &content);
+        hash.aset(ruby.to_symbol("character_count"), chars)?;
+        hash.aset(ruby.to_symbol("word_count"), words)?;
+    }
+    hash.aset(ruby.to_symbol("content"), content)?;
+    Ok(hash)
+}
+/// Compute `(character_count, word_count)` for a section.
+///
+/// When no truncation was requested the counts are taken directly from
+/// `original_events`. When truncation is active, `truncated_content` is
+/// parsed again with a fixed set of extensions and the counts are taken
+/// from those events, so they describe the text that is actually
+/// returned.
+fn count_post_truncate(
+    original_events: &[Event<'_>],
+    truncate_params: Option<&TruncateParams>,
+    truncated_content: &str,
+) -> (usize, usize) {
+    if truncate_params.is_some() {
+        let reparse_options = pulldown_cmark::Options::ENABLE_GFM
+            | pulldown_cmark::Options::ENABLE_TABLES
+            | pulldown_cmark::Options::ENABLE_STRIKETHROUGH
+            | pulldown_cmark::Options::ENABLE_TASKLISTS
+            | pulldown_cmark::Options::ENABLE_FOOTNOTES;
+        let reparsed: Vec<Event> = Parser::new_ext(truncated_content, reparse_options).collect();
+        count_text(&reparsed)
+    } else {
+        count_text(original_events)
+    }
+}
+/// Return `(characters, words)` for the textual payload of `events`.
+///
+/// Text and inline-code payloads are concatenated, each followed by a
+/// single space; soft and hard breaks contribute one space. Characters
+/// are counted on the trimmed result, words are Unicode word segments.
+/// Code-block text arrives as `Text` events and is therefore included.
+fn count_text(events: &[Event<'_>]) -> (usize, usize) {
+    let joined = events.iter().fold(String::new(), |mut acc, event| {
+        match event {
+            Event::Text(t) | Event::Code(t) => {
+                acc.push_str(t);
+                acc.push(' ');
+            }
+            Event::SoftBreak | Event::HardBreak => acc.push(' '),
+            _ => {}
+        }
+        acc
+    });
+    let word_total = joined.unicode_words().count();
+    let char_total = joined.trim().chars().count();
+    (char_total, word_total)
+}
+/// Serialize `events` back to Markdown text via `pulldown-cmark-to-cmark`.
+///
+/// # Panics
+///
+/// Panics with "markdown serialization failed" if the serializer
+/// reports an error.
+fn render_markdown(events: &[Event<'_>]) -> String {
+    let mut rendered = String::new();
+    let outcome = pulldown_cmark_to_cmark::cmark(events.iter().cloned(), &mut rendered);
+    outcome.expect("markdown serialization failed");
+    rendered
+}
+/// Concatenate the payloads of all `Text` and `Code` events in `events`,
+/// in order and without separators. Every other event is ignored.
+fn collect_inline_text(events: &[Event<'_>]) -> String {
+    events
+        .iter()
+        .filter_map(|event| match event {
+            Event::Text(t) | Event::Code(t) => Some(&**t),
+            _ => None,
+        })
+        .collect()
+}
+/// Report whether `events` carries no actual content.
+///
+/// Content means at least one text, inline code, raw HTML (block or
+/// inline) or math (inline or display) event. A stream made up only of
+/// structural events counts as empty, so the caller can skip emitting a
+/// preamble entry that would only add noise.
+fn is_empty_content(events: &[Event<'_>]) -> bool {
+    events.iter().all(|event| {
+        !matches!(
+            event,
+            Event::Text(_)
+                | Event::Code(_)
+                | Event::Html(_)
+                | Event::InlineHtml(_)
+                | Event::InlineMath(_)
+                | Event::DisplayMath(_)
+        )
+    })
+}