RubyGems - tomos - Versions diffs - 0.1.0 → 0.4.0 - Mend

tomos 0.1.0 → 0.4.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 7c74f281acc538880271bcdb2face54257d5f9bcf052545ed2fc1dd14a3fd14d
-  data.tar.gz: 6e5ad11384eb2a1a201abb3c9ecf0a2cc3fb4fbebcb06bd3894ab339a5b20c05
+  metadata.gz: 929f65b3b0e88a33b57073c3471e9b814774f224beeea97bf0a86ee5f7c019ac
+  data.tar.gz: b47cfae4cda36d40642d637dbc16c57656ab6eb59db44259fce3b0456cc05192
 SHA512:
-  metadata.gz: 2b760a8366624a206bac3263b00ec5e741ac95af3d4e1022f3ac92ba5607bd9d70cf9b867c3fba5779faee89eab77f81f1d8675770d0ac16e02f84f0bf04a50d
-  data.tar.gz: 1320c81c79e91005731657752acb25f2b8e4449bb22e413489b3de4a137826ee85d1d08c836818eae3146003441388963cf789c46682219cefb1ab490284a5a0
+  metadata.gz: 46f87d29b1a95ce78898bc3beee868e6c0626d5114d90ec264cac62f186660f3faabc4ea6a6f3adfe447e516f4ce3482e630db7dde2b9b0e0482ccb9295d5c30
+  data.tar.gz: cd5951e757827d15e16664cfc22b6aadfd8a5e85f43f7456df569a990ab526b1478591c12c0aa5e80ec937f184d49237b670c549d8047a4401afa9c4f0efc2c9

data/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,13 @@
+# Changelog
+## 0.4.0
+### Added
+- `Tomos::Text.count_tokens(text, model:)` and `Tomos::Markdown.count_tokens(text, model:)` class methods for counting tokens without constructing a splitter.
+- `#count_tokens(text)` instance method on both `Tomos::Text` and `Tomos::Markdown`, reusing the BPE tokenizer already resolved at construction time.
+- Both forms return `0` for empty input and raise `ArgumentError` for unrecognized models, consistent with the existing splitter API.
+### Deprecated
+Using `Tomos::Text.new(model:, capacity: <very_large_number>)` and reading `chunks(text).first.token_count` as a workaround for token counting is no longer necessary. Use `count_tokens` instead.

data/Cargo.lock CHANGED Viewed

@@ -89,6 +89,15 @@ version = "2.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af"
+[[package]]
+name = "block-buffer"
+version = "0.10.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
+dependencies = [
+ "generic-array",
+]
 [[package]]
 name = "bstr"
 version = "1.12.1"
@@ -135,6 +144,25 @@ dependencies = [
  "libm",
 ]
+[[package]]
+name = "cpufeatures"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
+dependencies = [
+ "libc",
+]
+[[package]]
+name = "crypto-common"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
+dependencies = [
+ "generic-array",
+ "typenum",
+]
 [[package]]
 name = "derive_utils"
 version = "0.15.1"
@@ -146,6 +174,16 @@ dependencies = [
  "syn",
 ]
+[[package]]
+name = "digest"
+version = "0.10.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
+dependencies = [
+ "block-buffer",
+ "crypto-common",
+]
 [[package]]
 name = "displaydoc"
 version = "0.2.5"
@@ -174,6 +212,16 @@ dependencies = [
  "regex-syntax",
 ]
+[[package]]
+name = "generic-array"
+version = "0.14.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
+dependencies = [
+ "typenum",
+ "version_check",
+]
 [[package]]
 name = "getrandom"
 version = "0.3.4"
@@ -541,6 +589,17 @@ dependencies = [
  "syn",
 ]
+[[package]]
+name = "sha2"
+version = "0.10.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
+dependencies = [
+ "cfg-if",
+ "cpufeatures",
+ "digest",
+]
 [[package]]
 name = "shell-words"
 version = "1.1.1"
@@ -673,10 +732,17 @@ version = "0.1.0"
 dependencies = [
  "magnus",
  "rb-sys",
+ "sha2",
  "text-splitter",
  "tiktoken-rs",
 ]
+[[package]]
+name = "typenum"
+version = "1.19.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
 [[package]]
 name = "unicase"
 version = "2.9.0"

data/README.md ADDED Viewed

@@ -0,0 +1,105 @@
+# tomos
+Token-aware text chunking for RAG pipelines, powered by Rust.
+Tomos wraps the [`text-splitter`](https://github.com/benbrandt/text-splitter) Rust crate with [tiktoken](https://github.com/zurawiki/tiktoken-rs) tokenization and exposes two splitter classes to Ruby: `Tomos::Text` for plain text and `Tomos::Markdown` for Markdown documents. Each chunk carries its token count, byte position, and a SHA-256 content fingerprint.
+## Installation
+```ruby
+gem "tomos"
+```
+Because tomos includes a native Rust extension, you'll need a Rust toolchain installed. The gem compiles on `bundle install`.
+## Usage
+### Splitting text into chunks
+```ruby
+splitter = Tomos::Text.new(model: "gpt-4", capacity: 512)
+chunks = splitter.chunks("A long document goes here...")
+chunks.each do |chunk|
+  chunk.text         # => String  — the chunk content
+  chunk.token_count  # => Integer — tokens in this chunk
+  chunk.byte_offset  # => Integer — start position in the original string
+  chunk.byte_length  # => Integer — byte length of the chunk
+  chunk.chunk_id     # => String  — 64-char SHA-256 hex digest
+end
+```
+The `capacity` is the maximum number of tokens per chunk. An optional `overlap` keyword shares tokens between adjacent chunks, which helps preserve context at boundaries:
+```ruby
+splitter = Tomos::Text.new(model: "gpt-4", capacity: 512, overlap: 50)
+```
+### Splitting Markdown
+`Tomos::Markdown` is Markdown-structure-aware — it respects headers, lists, and code fences when deciding where to split:
+```ruby
+splitter = Tomos::Markdown.new(model: "gpt-4", capacity: 512)
+chunks = splitter.chunks(File.read("document.md"))
+```
+Note: tokens are always counted on the raw input string, regardless of splitter type; `Markdown` differs from `Text` only in where it chooses split boundaries.
+### Counting tokens
+Count tokens directly without constructing a splitter:
+```ruby
+# Class method — resolves the tokenizer fresh each call
+Tomos::Text.count_tokens("Hello, world!", model: "gpt-4")
+# => 4
+Tomos::Markdown.count_tokens("# Hello\n\nWorld", model: "gpt-4")
+# => 4
+```
+If you already have a splitter instance, the instance method reuses its already-resolved tokenizer:
+```ruby
+splitter = Tomos::Text.new(model: "gpt-4", capacity: 512)
+splitter.count_tokens("Hello, world!")
+# => 4
+```
+Both forms return `0` for empty input and raise `ArgumentError` for unrecognized model names.
+### Supported models
+Any model name recognized by tiktoken, including:
+- `gpt-4`, `gpt-4o`, `gpt-4.1`, `gpt-5` and their versioned variants (e.g. `gpt-4o-2024-05-13`)
+- `o1`, `o3`, `o4` and their variants (e.g. `o1-mini`)
+- `gpt-3.5-turbo`
+- `text-embedding-ada-002`, `text-embedding-3-small`, `text-embedding-3-large`
+Unrecognized model names raise `ArgumentError`.
+## Chunk metadata
+Each `Tomos::Chunk` exposes:
+| Method | Type | Description |
+|---|---|---|
+| `text` | `String` | The chunk content |
+| `token_count` | `Integer` | Number of tokens in this chunk |
+| `byte_offset` | `Integer` | Start byte position in the source string |
+| `byte_length` | `Integer` | Byte length of the chunk |
+| `chunk_id` | `String` | 64-char lowercase SHA-256 hex digest of the chunk text |
+The byte metadata lets you map a chunk back to its exact position in the source:
+```ruby
+source[chunk.byte_offset, chunk.byte_length] == chunk.text # => true
+```
+The `chunk_id` is deterministic — the same text always produces the same ID, regardless of model, capacity, or overlap.
+## License
+MIT

data/ext/tomos/Cargo.toml CHANGED Viewed

@@ -12,5 +12,6 @@ crate-type = ["cdylib"]
 [dependencies]
 text-splitter = { version = "0.29", features = ["tiktoken-rs", "markdown"] }
 tiktoken-rs = "0.9"
+sha2 = "0.10"
 magnus = "0.8"
 rb-sys = { version = "0.9", features = ["stable-api-compiled-fallback"] }

data/ext/tomos/src/lib.rs CHANGED Viewed

@@ -1,21 +1,36 @@
+//! Tomos – Ruby-native text chunking backed by [`text_splitter`] and [`tiktoken_rs`].
+//!
+//! Exposes `Tomos::Text` and `Tomos::Markdown` to Ruby via Magnus. Each wraps a
+//! token-aware splitter that produces `Tomos::Chunk` objects carrying metadata
+//! (token count, byte offset/length, SHA-256 chunk ID).
 use magnus::{function, method, prelude::*, Error, Ruby};
+use sha2::{Digest, Sha256};
 use text_splitter::{ChunkConfig, MarkdownSplitter, TextSplitter};
 use tiktoken_rs::get_bpe_from_model;
 type RbResult<T> = Result<T, Error>;
+/// Resolve a tiktoken BPE tokenizer by model name, mapping unknown models to
+/// a Ruby `ArgumentError`.
+fn resolve_bpe(ruby: &Ruby, model: &str) -> RbResult<tiktoken_rs::CoreBPE> {
+    get_bpe_from_model(model).map_err(|e| {
+        Error::new(
+            ruby.exception_arg_error(),
+            format!("unrecognized tiktoken model '{model}': {e}"),
+        )
+    })
+}
+/// Build a [`ChunkConfig`] with the given token `capacity` and `overlap`,
+/// sized by the BPE tokenizer for `model`.
 fn build_chunk_config(
     ruby: &Ruby,
     model: &str,
     capacity: usize,
     overlap: usize,
 ) -> RbResult<ChunkConfig<tiktoken_rs::CoreBPE>> {
-    let bpe = get_bpe_from_model(model).map_err(|e| {
-        Error::new(
-            ruby.exception_arg_error(),
-            format!("unrecognized tiktoken model '{model}': {e}"),
-        )
-    })?;
+    let bpe = resolve_bpe(ruby, model)?;
     let config = ChunkConfig::new(capacity)
         .with_overlap(overlap)
@@ -30,60 +45,147 @@ fn build_chunk_config(
     Ok(config)
 }
-/// Splits unstructured text along Unicode boundaries (sentences, words,
-/// grapheme clusters). Works well on transcripts and content with no
-/// paragraph or section structure.
-#[magnus::wrap(class = "Tomos::Text", free_immediately, size)]
-struct RbText {
-    splitter: TextSplitter<tiktoken_rs::CoreBPE>,
+/// Produce a hex-encoded SHA-256 digest of `text` for use as a chunk identifier.
+fn chunk_id(text: &str) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(text.as_bytes());
+    format!("{:x}", hasher.finalize())
 }
-impl RbText {
-    fn new(ruby: &Ruby, model: String, capacity: usize, overlap: usize) -> RbResult<Self> {
-        let config = build_chunk_config(ruby, &model, capacity, overlap)?;
-        Ok(Self {
-            splitter: TextSplitter::new(config),
-        })
+/// Collect splitter output into a Ruby array of [`RbChunk`] objects.
+fn collect_chunks<'a>(
+    ruby: &Ruby,
+    iter: impl Iterator<Item = (usize, &'a str)>,
+    bpe: &tiktoken_rs::CoreBPE,
+) -> magnus::RArray {
+    let ary = ruby.ary_new();
+    for (offset, slice) in iter {
+        let _ = ary.push(RbChunk {
+            text: slice.to_owned(),
+            token_count: bpe.encode_ordinary(slice).len(),
+            byte_offset: offset,
+            byte_length: slice.len(),
+            chunk_id: chunk_id(slice),
+        });
     }
+    ary
+}
-    fn chunks(&self, text: String) -> Vec<String> {
-        self.splitter.chunks(&text).map(str::to_owned).collect()
+/// Count tokens in `text` using the given BPE tokenizer.
+fn count_tokens_with_bpe(bpe: &tiktoken_rs::CoreBPE, text: &str) -> usize {
+    if text.is_empty() {
+        return 0;
     }
+    bpe.encode_ordinary(text).len()
+}
+/// Standalone token-counting entry point exposed as a class-level method on
+/// both `Tomos::Text` and `Tomos::Markdown`.
+fn count_tokens(ruby: &Ruby, text: String, model: String) -> RbResult<usize> {
+    let bpe = resolve_bpe(ruby, &model)?;
+    Ok(count_tokens_with_bpe(&bpe, &text))
 }
-/// Splits CommonMark/GFM markdown along structural boundaries (headings,
-/// code fences, list items, block elements) in addition to the Unicode
-/// levels that `RbText` uses. Degrades gracefully to plain-text splitting
-/// when the input contains no markdown structure.
-#[magnus::wrap(class = "Tomos::Markdown", free_immediately, size)]
-struct RbMarkdown {
-    splitter: MarkdownSplitter<tiktoken_rs::CoreBPE>,
+/// A single chunk produced by a splitter, exposed to Ruby as `Tomos::Chunk`.
+#[derive(Debug)]
+#[magnus::wrap(class = "Tomos::Chunk", free_immediately, size)]
+struct RbChunk {
+    text: String,
+    token_count: usize,
+    byte_offset: usize,
+    byte_length: usize,
+    chunk_id: String,
 }
-impl RbMarkdown {
-    fn new(ruby: &Ruby, model: String, capacity: usize, overlap: usize) -> RbResult<Self> {
-        let config = build_chunk_config(ruby, &model, capacity, overlap)?;
-        Ok(Self {
-            splitter: MarkdownSplitter::new(config),
-        })
+impl RbChunk {
+    fn text(&self) -> &str {
+        &self.text
+    }
+    fn token_count(&self) -> usize {
+        self.token_count
+    }
+    fn byte_offset(&self) -> usize {
+        self.byte_offset
+    }
+    fn byte_length(&self) -> usize {
+        self.byte_length
     }
-    fn chunks(&self, text: String) -> Vec<String> {
-        self.splitter.chunks(&text).map(str::to_owned).collect()
+    fn chunk_id(&self) -> &str {
+        &self.chunk_id
     }
 }
+/// Generates a splitter wrapper struct with `new`, `chunks`, and `count_tokens`
+/// methods. Both `RbText` and `RbMarkdown` share identical structure and logic,
+/// differing only in the underlying splitter type.
+macro_rules! define_splitter {
+    ($name:ident, $class:literal, $splitter:ty) => {
+        #[magnus::wrap(class = $class, free_immediately, size)]
+        struct $name {
+            splitter: $splitter,
+            bpe: tiktoken_rs::CoreBPE,
+        }
+        impl $name {
+            fn new(
+                ruby: &Ruby,
+                model: String,
+                capacity: usize,
+                overlap: usize,
+            ) -> RbResult<Self> {
+                let config = build_chunk_config(ruby, &model, capacity, overlap)?;
+                let bpe = resolve_bpe(ruby, &model)?;
+                Ok(Self {
+                    splitter: <$splitter>::new(config),
+                    bpe,
+                })
+            }
+            fn chunks(&self, text: String) -> magnus::RArray {
+                let ruby = Ruby::get().expect("chunks called outside Ruby thread");
+                collect_chunks(&ruby, self.splitter.chunk_indices(&text), &self.bpe)
+            }
+            fn count_tokens(&self, text: String) -> usize {
+                count_tokens_with_bpe(&self.bpe, &text)
+            }
+        }
+    };
+}
+define_splitter!(RbText, "Tomos::Text", TextSplitter<tiktoken_rs::CoreBPE>);
+define_splitter!(
+    RbMarkdown,
+    "Tomos::Markdown",
+    MarkdownSplitter<tiktoken_rs::CoreBPE>
+);
 #[magnus::init]
 fn init(ruby: &Ruby) -> RbResult<()> {
     let module = ruby.define_module("Tomos")?;
+    let chunk = module.define_class("Chunk", ruby.class_object())?;
+    chunk.define_method("text", method!(RbChunk::text, 0))?;
+    chunk.define_method("token_count", method!(RbChunk::token_count, 0))?;
+    chunk.define_method("byte_offset", method!(RbChunk::byte_offset, 0))?;
+    chunk.define_method("byte_length", method!(RbChunk::byte_length, 0))?;
+    chunk.define_method("chunk_id", method!(RbChunk::chunk_id, 0))?;
     let text = module.define_class("Text", ruby.class_object())?;
-    text.define_singleton_method("new", function!(RbText::new, 3))?;
+    text.define_singleton_method("_new", function!(RbText::new, 3))?;
     text.define_method("chunks", method!(RbText::chunks, 1))?;
+    text.define_method("count_tokens", method!(RbText::count_tokens, 1))?;
+    text.define_singleton_method("_count_tokens", function!(count_tokens, 2))?;
     let markdown = module.define_class("Markdown", ruby.class_object())?;
-    markdown.define_singleton_method("new", function!(RbMarkdown::new, 3))?;
+    markdown.define_singleton_method("_new", function!(RbMarkdown::new, 3))?;
     markdown.define_method("chunks", method!(RbMarkdown::chunks, 1))?;
+    markdown.define_method("count_tokens", method!(RbMarkdown::count_tokens, 1))?;
+    markdown.define_singleton_method("_count_tokens", function!(count_tokens, 2))?;
     Ok(())
 }

data/lib/tomos/tomos.bundle CHANGED Viewed

Binary file

data/lib/tomos/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Tomos
-  VERSION = "0.1.0"
+  VERSION = "0.4.0"
 end

data/lib/tomos.rb CHANGED Viewed

@@ -2,3 +2,25 @@
 require "tomos/version"
 require "tomos/tomos"
+module Tomos
+  class Text
+    def self.new(model:, capacity:, overlap: 0)
+      _new(model, capacity, overlap)
+    end
+    def self.count_tokens(text, model:)
+      _count_tokens(text, model)
+    end
+  end
+  class Markdown
+    def self.new(model:, capacity:, overlap: 0)
+      _new(model, capacity, overlap)
+    end
+    def self.count_tokens(text, model:)
+      _count_tokens(text, model)
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: tomos
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.4.0
 platform: ruby
 authors:
 - Dan Frenette
@@ -30,8 +30,10 @@ extensions:
 - ext/tomos/extconf.rb
 extra_rdoc_files: []
 files:
+- CHANGELOG.md
 - Cargo.lock
 - Cargo.toml
+- README.md
 - ext/tomos/Cargo.toml
 - ext/tomos/extconf.rb
 - ext/tomos/src/lib.rs