tomos 0.1.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7c74f281acc538880271bcdb2face54257d5f9bcf052545ed2fc1dd14a3fd14d
4
- data.tar.gz: 6e5ad11384eb2a1a201abb3c9ecf0a2cc3fb4fbebcb06bd3894ab339a5b20c05
3
+ metadata.gz: 929f65b3b0e88a33b57073c3471e9b814774f224beeea97bf0a86ee5f7c019ac
4
+ data.tar.gz: b47cfae4cda36d40642d637dbc16c57656ab6eb59db44259fce3b0456cc05192
5
5
  SHA512:
6
- metadata.gz: 2b760a8366624a206bac3263b00ec5e741ac95af3d4e1022f3ac92ba5607bd9d70cf9b867c3fba5779faee89eab77f81f1d8675770d0ac16e02f84f0bf04a50d
7
- data.tar.gz: 1320c81c79e91005731657752acb25f2b8e4449bb22e413489b3de4a137826ee85d1d08c836818eae3146003441388963cf789c46682219cefb1ab490284a5a0
6
+ metadata.gz: 46f87d29b1a95ce78898bc3beee868e6c0626d5114d90ec264cac62f186660f3faabc4ea6a6f3adfe447e516f4ce3482e630db7dde2b9b0e0482ccb9295d5c30
7
+ data.tar.gz: cd5951e757827d15e16664cfc22b6aadfd8a5e85f43f7456df569a990ab526b1478591c12c0aa5e80ec937f184d49237b670c549d8047a4401afa9c4f0efc2c9
data/CHANGELOG.md ADDED
@@ -0,0 +1,13 @@
1
+ # Changelog
2
+
3
+ ## 0.4.0
4
+
5
+ ### Added
6
+
7
+ - `Tomos::Text.count_tokens(text, model:)` and `Tomos::Markdown.count_tokens(text, model:)` class methods for counting tokens without constructing a splitter.
8
+ - `#count_tokens(text)` instance method on both `Tomos::Text` and `Tomos::Markdown`, reusing the BPE tokenizer already resolved at construction time.
9
+ - Both forms return `0` for empty input. Unrecognized models raise `ArgumentError` — from the class method directly, or from the constructor when using the instance method — consistent with the existing splitter API.
10
+
11
+ ### Deprecated
12
+
13
+ - Using `Tomos::Text.new(model:, capacity: <very_large_number>)` and reading `chunks(text).first.token_count` as a workaround for token counting is no longer necessary. Use `count_tokens` instead.
data/Cargo.lock CHANGED
@@ -89,6 +89,15 @@ version = "2.11.0"
89
89
  source = "registry+https://github.com/rust-lang/crates.io-index"
90
90
  checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af"
91
91
 
92
+ [[package]]
93
+ name = "block-buffer"
94
+ version = "0.10.4"
95
+ source = "registry+https://github.com/rust-lang/crates.io-index"
96
+ checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
97
+ dependencies = [
98
+ "generic-array",
99
+ ]
100
+
92
101
  [[package]]
93
102
  name = "bstr"
94
103
  version = "1.12.1"
@@ -135,6 +144,25 @@ dependencies = [
135
144
  "libm",
136
145
  ]
137
146
 
147
+ [[package]]
148
+ name = "cpufeatures"
149
+ version = "0.2.17"
150
+ source = "registry+https://github.com/rust-lang/crates.io-index"
151
+ checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
152
+ dependencies = [
153
+ "libc",
154
+ ]
155
+
156
+ [[package]]
157
+ name = "crypto-common"
158
+ version = "0.1.7"
159
+ source = "registry+https://github.com/rust-lang/crates.io-index"
160
+ checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
161
+ dependencies = [
162
+ "generic-array",
163
+ "typenum",
164
+ ]
165
+
138
166
  [[package]]
139
167
  name = "derive_utils"
140
168
  version = "0.15.1"
@@ -146,6 +174,16 @@ dependencies = [
146
174
  "syn",
147
175
  ]
148
176
 
177
+ [[package]]
178
+ name = "digest"
179
+ version = "0.10.7"
180
+ source = "registry+https://github.com/rust-lang/crates.io-index"
181
+ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
182
+ dependencies = [
183
+ "block-buffer",
184
+ "crypto-common",
185
+ ]
186
+
149
187
  [[package]]
150
188
  name = "displaydoc"
151
189
  version = "0.2.5"
@@ -174,6 +212,16 @@ dependencies = [
174
212
  "regex-syntax",
175
213
  ]
176
214
 
215
+ [[package]]
216
+ name = "generic-array"
217
+ version = "0.14.7"
218
+ source = "registry+https://github.com/rust-lang/crates.io-index"
219
+ checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
220
+ dependencies = [
221
+ "typenum",
222
+ "version_check",
223
+ ]
224
+
177
225
  [[package]]
178
226
  name = "getrandom"
179
227
  version = "0.3.4"
@@ -541,6 +589,17 @@ dependencies = [
541
589
  "syn",
542
590
  ]
543
591
 
592
+ [[package]]
593
+ name = "sha2"
594
+ version = "0.10.9"
595
+ source = "registry+https://github.com/rust-lang/crates.io-index"
596
+ checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
597
+ dependencies = [
598
+ "cfg-if",
599
+ "cpufeatures",
600
+ "digest",
601
+ ]
602
+
544
603
  [[package]]
545
604
  name = "shell-words"
546
605
  version = "1.1.1"
@@ -673,10 +732,17 @@ version = "0.1.0"
673
732
  dependencies = [
674
733
  "magnus",
675
734
  "rb-sys",
735
+ "sha2",
676
736
  "text-splitter",
677
737
  "tiktoken-rs",
678
738
  ]
679
739
 
740
+ [[package]]
741
+ name = "typenum"
742
+ version = "1.19.0"
743
+ source = "registry+https://github.com/rust-lang/crates.io-index"
744
+ checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
745
+
680
746
  [[package]]
681
747
  name = "unicase"
682
748
  version = "2.9.0"
data/README.md ADDED
@@ -0,0 +1,105 @@
1
+ # tomos
2
+
3
+ Token-aware text chunking for RAG pipelines, powered by Rust.
4
+
5
+ Tomos wraps the [`text-splitter`](https://github.com/benbrandt/text-splitter) Rust crate with [tiktoken](https://github.com/zurawiki/tiktoken-rs) tokenization and exposes two splitter classes to Ruby: `Tomos::Text` for plain text and `Tomos::Markdown` for Markdown documents. Each chunk carries its token count, byte position, and a SHA-256 content fingerprint.
6
+
7
+ ## Installation
8
+
9
+ ```ruby
10
+ gem "tomos"
11
+ ```
12
+
13
+ Because tomos includes a native Rust extension, you'll need a Rust toolchain installed. The gem compiles on `bundle install`.
14
+
15
+ ## Usage
16
+
17
+ ### Splitting text into chunks
18
+
19
+ ```ruby
20
+ splitter = Tomos::Text.new(model: "gpt-4", capacity: 512)
21
+ chunks = splitter.chunks("A long document goes here...")
22
+
23
+ chunks.each do |chunk|
24
+ chunk.text # => String — the chunk content
25
+ chunk.token_count # => Integer — tokens in this chunk
26
+ chunk.byte_offset # => Integer — start position in the original string
27
+ chunk.byte_length # => Integer — byte length of the chunk
28
+ chunk.chunk_id # => String — 64-char SHA-256 hex digest
29
+ end
30
+ ```
31
+
32
+ The `capacity` is the maximum number of tokens per chunk. An optional `overlap` keyword shares tokens between adjacent chunks, which helps preserve context at boundaries:
33
+
34
+ ```ruby
35
+ splitter = Tomos::Text.new(model: "gpt-4", capacity: 512, overlap: 50)
36
+ ```
37
+
38
+ ### Splitting Markdown
39
+
40
+ `Tomos::Markdown` is Markdown-structure-aware — it respects headers, lists, and code fences when deciding where to split:
41
+
42
+ ```ruby
43
+ splitter = Tomos::Markdown.new(model: "gpt-4", capacity: 512)
44
+ chunks = splitter.chunks(File.read("document.md"))
45
+ ```
46
+
47
+ Note: tokenization is over the raw input string regardless of splitter type; `Markdown` differs only in where it chooses split boundaries.
48
+
49
+ ### Counting tokens
50
+
51
+ Count tokens directly without constructing a splitter:
52
+
53
+ ```ruby
54
+ # Class method — resolves the tokenizer fresh on each call
55
+ Tomos::Text.count_tokens("Hello, world!", model: "gpt-4")
56
+ # => 4
57
+
58
+ Tomos::Markdown.count_tokens("# Hello\n\nWorld", model: "gpt-4")
59
+ # => 4
60
+ ```
61
+
62
+ If you already have a splitter instance, the instance method reuses its already-resolved tokenizer:
63
+
64
+ ```ruby
65
+ splitter = Tomos::Text.new(model: "gpt-4", capacity: 512)
66
+ splitter.count_tokens("Hello, world!")
67
+ # => 4
68
+ ```
69
+
70
+ Both forms return `0` for empty input. An unrecognized model name raises `ArgumentError` — from the class method itself, or from the splitter constructor when using the instance method.
71
+
72
+ ### Supported models
73
+
74
+ Any model name recognized by tiktoken, including:
75
+
76
+ - `gpt-4`, `gpt-4o`, `gpt-4.1`, `gpt-5`
77
+ - `o1`, `o3`, `o4`, plus suffixed or dated variants of any listed model (e.g. `o1-mini`, `gpt-4o-2024-05-13`)
78
+ - `gpt-3.5-turbo`
79
+ - `text-embedding-ada-002`, `text-embedding-3-small`, `text-embedding-3-large`
80
+
81
+ Unrecognized model names raise `ArgumentError`.
82
+
83
+ ## Chunk metadata
84
+
85
+ Each `Tomos::Chunk` exposes:
86
+
87
+ | Method | Type | Description |
88
+ |---|---|---|
89
+ | `text` | `String` | The chunk content |
90
+ | `token_count` | `Integer` | Number of tokens in this chunk |
91
+ | `byte_offset` | `Integer` | Start byte position in the source string |
92
+ | `byte_length` | `Integer` | Byte length of the chunk |
93
+ | `chunk_id` | `String` | 64-char lowercase SHA-256 hex digest of the chunk text |
94
+
95
+ The byte metadata lets you map a chunk back to its exact position in the source:
96
+
97
+ ```ruby
98
+ source.byteslice(chunk.byte_offset, chunk.byte_length) == chunk.text # => true
99
+ ```
100
+
101
+ The `chunk_id` is deterministic — the same text always produces the same ID, regardless of model, capacity, or overlap.
102
+
103
+ ## License
104
+
105
+ MIT
data/ext/tomos/Cargo.toml CHANGED
@@ -12,5 +12,6 @@ crate-type = ["cdylib"]
12
12
  [dependencies]
13
13
  text-splitter = { version = "0.29", features = ["tiktoken-rs", "markdown"] }
14
14
  tiktoken-rs = "0.9"
15
+ sha2 = "0.10"
15
16
  magnus = "0.8"
16
17
  rb-sys = { version = "0.9", features = ["stable-api-compiled-fallback"] }
data/ext/tomos/src/lib.rs CHANGED
@@ -1,21 +1,36 @@
1
+ //! Tomos – Ruby-native text chunking backed by [`text_splitter`] and [`tiktoken_rs`].
2
+ //!
3
+ //! Exposes `Tomos::Text` and `Tomos::Markdown` to Ruby via Magnus. Each wraps a
4
+ //! token-aware splitter that produces `Tomos::Chunk` objects carrying metadata
5
+ //! (token count, byte offset/length, SHA-256 chunk ID).
6
+
1
7
  use magnus::{function, method, prelude::*, Error, Ruby};
8
+ use sha2::{Digest, Sha256};
2
9
  use text_splitter::{ChunkConfig, MarkdownSplitter, TextSplitter};
3
10
  use tiktoken_rs::get_bpe_from_model;
4
11
 
5
12
  type RbResult<T> = Result<T, Error>;
6
13
 
14
+ /// Resolve a tiktoken BPE tokenizer by model name, mapping unknown models to
15
+ /// a Ruby `ArgumentError`.
16
+ fn resolve_bpe(ruby: &Ruby, model: &str) -> RbResult<tiktoken_rs::CoreBPE> {
17
+ get_bpe_from_model(model).map_err(|e| {
18
+ Error::new(
19
+ ruby.exception_arg_error(),
20
+ format!("unrecognized tiktoken model '{model}': {e}"),
21
+ )
22
+ })
23
+ }
24
+
25
+ /// Build a [`ChunkConfig`] with the given token `capacity` and `overlap`,
26
+ /// sized by the BPE tokenizer for `model`.
7
27
  fn build_chunk_config(
8
28
  ruby: &Ruby,
9
29
  model: &str,
10
30
  capacity: usize,
11
31
  overlap: usize,
12
32
  ) -> RbResult<ChunkConfig<tiktoken_rs::CoreBPE>> {
13
- let bpe = get_bpe_from_model(model).map_err(|e| {
14
- Error::new(
15
- ruby.exception_arg_error(),
16
- format!("unrecognized tiktoken model '{model}': {e}"),
17
- )
18
- })?;
33
+ let bpe = resolve_bpe(ruby, model)?;
19
34
 
20
35
  let config = ChunkConfig::new(capacity)
21
36
  .with_overlap(overlap)
@@ -30,60 +45,147 @@ fn build_chunk_config(
30
45
  Ok(config)
31
46
  }
32
47
 
33
- /// Splits unstructured text along Unicode boundaries (sentences, words,
34
- /// grapheme clusters). Works well on transcripts and content with no
35
- /// paragraph or section structure.
36
- #[magnus::wrap(class = "Tomos::Text", free_immediately, size)]
37
- struct RbText {
38
- splitter: TextSplitter<tiktoken_rs::CoreBPE>,
48
+ /// Produce a hex-encoded SHA-256 digest of `text` for use as a chunk identifier.
49
+ fn chunk_id(text: &str) -> String {
50
+ let mut hasher = Sha256::new();
51
+ hasher.update(text.as_bytes());
52
+ format!("{:x}", hasher.finalize())
39
53
  }
40
54
 
41
- impl RbText {
42
- fn new(ruby: &Ruby, model: String, capacity: usize, overlap: usize) -> RbResult<Self> {
43
- let config = build_chunk_config(ruby, &model, capacity, overlap)?;
44
- Ok(Self {
45
- splitter: TextSplitter::new(config),
46
- })
55
+ /// Collect splitter output into a Ruby array of [`RbChunk`] objects.
56
+ fn collect_chunks<'a>(
57
+ ruby: &Ruby,
58
+ iter: impl Iterator<Item = (usize, &'a str)>,
59
+ bpe: &tiktoken_rs::CoreBPE,
60
+ ) -> magnus::RArray {
61
+ let ary = ruby.ary_new();
62
+ for (offset, slice) in iter {
63
+ let _ = ary.push(RbChunk {
64
+ text: slice.to_owned(),
65
+ token_count: bpe.encode_ordinary(slice).len(),
66
+ byte_offset: offset,
67
+ byte_length: slice.len(),
68
+ chunk_id: chunk_id(slice),
69
+ });
47
70
  }
71
+ ary
72
+ }
48
73
 
49
- fn chunks(&self, text: String) -> Vec<String> {
50
- self.splitter.chunks(&text).map(str::to_owned).collect()
74
+ /// Count tokens in `text` using the given BPE tokenizer.
75
+ fn count_tokens_with_bpe(bpe: &tiktoken_rs::CoreBPE, text: &str) -> usize {
76
+ if text.is_empty() {
77
+ return 0;
51
78
  }
79
+ bpe.encode_ordinary(text).len()
80
+ }
81
+
82
+ /// Standalone token-counting entry point exposed as a class-level method on
83
+ /// both `Tomos::Text` and `Tomos::Markdown`.
84
+ fn count_tokens(ruby: &Ruby, text: String, model: String) -> RbResult<usize> {
85
+ let bpe = resolve_bpe(ruby, &model)?;
86
+ Ok(count_tokens_with_bpe(&bpe, &text))
52
87
  }
53
88
 
54
- /// Splits CommonMark/GFM markdown along structural boundaries (headings,
55
- /// code fences, list items, block elements) in addition to the Unicode
56
- /// levels that `RbText` uses. Degrades gracefully to plain-text splitting
57
- /// when the input contains no markdown structure.
58
- #[magnus::wrap(class = "Tomos::Markdown", free_immediately, size)]
59
- struct RbMarkdown {
60
- splitter: MarkdownSplitter<tiktoken_rs::CoreBPE>,
89
+ /// A single chunk produced by a splitter, exposed to Ruby as `Tomos::Chunk`.
90
+ #[derive(Debug)]
91
+ #[magnus::wrap(class = "Tomos::Chunk", free_immediately, size)]
92
+ struct RbChunk {
93
+ text: String,
94
+ token_count: usize,
95
+ byte_offset: usize,
96
+ byte_length: usize,
97
+ chunk_id: String,
61
98
  }
62
99
 
63
- impl RbMarkdown {
64
- fn new(ruby: &Ruby, model: String, capacity: usize, overlap: usize) -> RbResult<Self> {
65
- let config = build_chunk_config(ruby, &model, capacity, overlap)?;
66
- Ok(Self {
67
- splitter: MarkdownSplitter::new(config),
68
- })
100
+ impl RbChunk {
101
+ fn text(&self) -> &str {
102
+ &self.text
103
+ }
104
+
105
+ fn token_count(&self) -> usize {
106
+ self.token_count
107
+ }
108
+
109
+ fn byte_offset(&self) -> usize {
110
+ self.byte_offset
111
+ }
112
+
113
+ fn byte_length(&self) -> usize {
114
+ self.byte_length
69
115
  }
70
116
 
71
- fn chunks(&self, text: String) -> Vec<String> {
72
- self.splitter.chunks(&text).map(str::to_owned).collect()
117
+ fn chunk_id(&self) -> &str {
118
+ &self.chunk_id
73
119
  }
74
120
  }
75
121
 
122
+ /// Generates a splitter wrapper struct with `new`, `chunks`, and `count_tokens`
123
+ /// methods. Both `RbText` and `RbMarkdown` share identical structure and logic,
124
+ /// differing only in the underlying splitter type.
125
+ macro_rules! define_splitter {
126
+ ($name:ident, $class:literal, $splitter:ty) => {
127
+ #[magnus::wrap(class = $class, free_immediately, size)]
128
+ struct $name {
129
+ splitter: $splitter,
130
+ bpe: tiktoken_rs::CoreBPE,
131
+ }
132
+
133
+ impl $name {
134
+ fn new(
135
+ ruby: &Ruby,
136
+ model: String,
137
+ capacity: usize,
138
+ overlap: usize,
139
+ ) -> RbResult<Self> {
140
+ let config = build_chunk_config(ruby, &model, capacity, overlap)?;
141
+ let bpe = resolve_bpe(ruby, &model)?;
142
+ Ok(Self {
143
+ splitter: <$splitter>::new(config),
144
+ bpe,
145
+ })
146
+ }
147
+
148
+ fn chunks(&self, text: String) -> magnus::RArray {
149
+ let ruby = Ruby::get().expect("chunks called outside Ruby thread");
150
+ collect_chunks(&ruby, self.splitter.chunk_indices(&text), &self.bpe)
151
+ }
152
+
153
+ fn count_tokens(&self, text: String) -> usize {
154
+ count_tokens_with_bpe(&self.bpe, &text)
155
+ }
156
+ }
157
+ };
158
+ }
159
+
160
+ define_splitter!(RbText, "Tomos::Text", TextSplitter<tiktoken_rs::CoreBPE>);
161
+ define_splitter!(
162
+ RbMarkdown,
163
+ "Tomos::Markdown",
164
+ MarkdownSplitter<tiktoken_rs::CoreBPE>
165
+ );
166
+
76
167
  #[magnus::init]
77
168
  fn init(ruby: &Ruby) -> RbResult<()> {
78
169
  let module = ruby.define_module("Tomos")?;
79
170
 
171
+ let chunk = module.define_class("Chunk", ruby.class_object())?;
172
+ chunk.define_method("text", method!(RbChunk::text, 0))?;
173
+ chunk.define_method("token_count", method!(RbChunk::token_count, 0))?;
174
+ chunk.define_method("byte_offset", method!(RbChunk::byte_offset, 0))?;
175
+ chunk.define_method("byte_length", method!(RbChunk::byte_length, 0))?;
176
+ chunk.define_method("chunk_id", method!(RbChunk::chunk_id, 0))?;
177
+
80
178
  let text = module.define_class("Text", ruby.class_object())?;
81
- text.define_singleton_method("new", function!(RbText::new, 3))?;
179
+ text.define_singleton_method("_new", function!(RbText::new, 3))?;
82
180
  text.define_method("chunks", method!(RbText::chunks, 1))?;
181
+ text.define_method("count_tokens", method!(RbText::count_tokens, 1))?;
182
+ text.define_singleton_method("_count_tokens", function!(count_tokens, 2))?;
83
183
 
84
184
  let markdown = module.define_class("Markdown", ruby.class_object())?;
85
- markdown.define_singleton_method("new", function!(RbMarkdown::new, 3))?;
185
+ markdown.define_singleton_method("_new", function!(RbMarkdown::new, 3))?;
86
186
  markdown.define_method("chunks", method!(RbMarkdown::chunks, 1))?;
187
+ markdown.define_method("count_tokens", method!(RbMarkdown::count_tokens, 1))?;
188
+ markdown.define_singleton_method("_count_tokens", function!(count_tokens, 2))?;
87
189
 
88
190
  Ok(())
89
191
  }
Binary file
data/lib/tomos/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tomos
4
- VERSION = "0.1.0"
4
+ VERSION = "0.4.0"
5
5
  end
data/lib/tomos.rb CHANGED
@@ -2,3 +2,25 @@
2
2
 
3
3
  require "tomos/version"
4
4
  require "tomos/tomos"
5
+
6
+ module Tomos
7
+ class Text
8
+ def self.new(model:, capacity:, overlap: 0)
9
+ _new(model, capacity, overlap)
10
+ end
11
+
12
+ def self.count_tokens(text, model:)
13
+ _count_tokens(text, model)
14
+ end
15
+ end
16
+
17
+ class Markdown
18
+ def self.new(model:, capacity:, overlap: 0)
19
+ _new(model, capacity, overlap)
20
+ end
21
+
22
+ def self.count_tokens(text, model:)
23
+ _count_tokens(text, model)
24
+ end
25
+ end
26
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tomos
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dan Frenette
@@ -30,8 +30,10 @@ extensions:
30
30
  - ext/tomos/extconf.rb
31
31
  extra_rdoc_files: []
32
32
  files:
33
+ - CHANGELOG.md
33
34
  - Cargo.lock
34
35
  - Cargo.toml
36
+ - README.md
35
37
  - ext/tomos/Cargo.toml
36
38
  - ext/tomos/extconf.rb
37
39
  - ext/tomos/src/lib.rs