tomos 0.1.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/Cargo.lock +66 -0
- data/README.md +105 -0
- data/ext/tomos/Cargo.toml +1 -0
- data/ext/tomos/src/lib.rs +139 -37
- data/lib/tomos/tomos.bundle +0 -0
- data/lib/tomos/version.rb +1 -1
- data/lib/tomos.rb +22 -0
- metadata +3 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 929f65b3b0e88a33b57073c3471e9b814774f224beeea97bf0a86ee5f7c019ac
|
|
4
|
+
data.tar.gz: b47cfae4cda36d40642d637dbc16c57656ab6eb59db44259fce3b0456cc05192
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 46f87d29b1a95ce78898bc3beee868e6c0626d5114d90ec264cac62f186660f3faabc4ea6a6f3adfe447e516f4ce3482e630db7dde2b9b0e0482ccb9295d5c30
|
|
7
|
+
data.tar.gz: cd5951e757827d15e16664cfc22b6aadfd8a5e85f43f7456df569a990ab526b1478591c12c0aa5e80ec937f184d49237b670c549d8047a4401afa9c4f0efc2c9
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.4.0
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- `Tomos::Text.count_tokens(text, model:)` and `Tomos::Markdown.count_tokens(text, model:)` class methods for counting tokens without constructing a splitter.
|
|
8
|
+
- `#count_tokens(text)` instance method on both `Tomos::Text` and `Tomos::Markdown`, reusing the BPE tokenizer already resolved at construction time.
|
|
9
|
+
- Both forms return `0` for empty input and raise `ArgumentError` for unrecognized models, consistent with the existing splitter API.
|
|
10
|
+
|
|
11
|
+
### Deprecated
|
|
12
|
+
|
|
13
|
+
Using `Tomos::Text.new(model:, capacity: <very_large_number>)` and reading `chunks(text).first.token_count` as a workaround for token counting is no longer necessary. Use `count_tokens` instead.
|
data/Cargo.lock
CHANGED
|
@@ -89,6 +89,15 @@ version = "2.11.0"
|
|
|
89
89
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
90
90
|
checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af"
|
|
91
91
|
|
|
92
|
+
[[package]]
|
|
93
|
+
name = "block-buffer"
|
|
94
|
+
version = "0.10.4"
|
|
95
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
96
|
+
checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
|
|
97
|
+
dependencies = [
|
|
98
|
+
"generic-array",
|
|
99
|
+
]
|
|
100
|
+
|
|
92
101
|
[[package]]
|
|
93
102
|
name = "bstr"
|
|
94
103
|
version = "1.12.1"
|
|
@@ -135,6 +144,25 @@ dependencies = [
|
|
|
135
144
|
"libm",
|
|
136
145
|
]
|
|
137
146
|
|
|
147
|
+
[[package]]
|
|
148
|
+
name = "cpufeatures"
|
|
149
|
+
version = "0.2.17"
|
|
150
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
151
|
+
checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
|
|
152
|
+
dependencies = [
|
|
153
|
+
"libc",
|
|
154
|
+
]
|
|
155
|
+
|
|
156
|
+
[[package]]
|
|
157
|
+
name = "crypto-common"
|
|
158
|
+
version = "0.1.7"
|
|
159
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
160
|
+
checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
|
|
161
|
+
dependencies = [
|
|
162
|
+
"generic-array",
|
|
163
|
+
"typenum",
|
|
164
|
+
]
|
|
165
|
+
|
|
138
166
|
[[package]]
|
|
139
167
|
name = "derive_utils"
|
|
140
168
|
version = "0.15.1"
|
|
@@ -146,6 +174,16 @@ dependencies = [
|
|
|
146
174
|
"syn",
|
|
147
175
|
]
|
|
148
176
|
|
|
177
|
+
[[package]]
|
|
178
|
+
name = "digest"
|
|
179
|
+
version = "0.10.7"
|
|
180
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
181
|
+
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
|
|
182
|
+
dependencies = [
|
|
183
|
+
"block-buffer",
|
|
184
|
+
"crypto-common",
|
|
185
|
+
]
|
|
186
|
+
|
|
149
187
|
[[package]]
|
|
150
188
|
name = "displaydoc"
|
|
151
189
|
version = "0.2.5"
|
|
@@ -174,6 +212,16 @@ dependencies = [
|
|
|
174
212
|
"regex-syntax",
|
|
175
213
|
]
|
|
176
214
|
|
|
215
|
+
[[package]]
|
|
216
|
+
name = "generic-array"
|
|
217
|
+
version = "0.14.7"
|
|
218
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
219
|
+
checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
|
|
220
|
+
dependencies = [
|
|
221
|
+
"typenum",
|
|
222
|
+
"version_check",
|
|
223
|
+
]
|
|
224
|
+
|
|
177
225
|
[[package]]
|
|
178
226
|
name = "getrandom"
|
|
179
227
|
version = "0.3.4"
|
|
@@ -541,6 +589,17 @@ dependencies = [
|
|
|
541
589
|
"syn",
|
|
542
590
|
]
|
|
543
591
|
|
|
592
|
+
[[package]]
|
|
593
|
+
name = "sha2"
|
|
594
|
+
version = "0.10.9"
|
|
595
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
596
|
+
checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
|
|
597
|
+
dependencies = [
|
|
598
|
+
"cfg-if",
|
|
599
|
+
"cpufeatures",
|
|
600
|
+
"digest",
|
|
601
|
+
]
|
|
602
|
+
|
|
544
603
|
[[package]]
|
|
545
604
|
name = "shell-words"
|
|
546
605
|
version = "1.1.1"
|
|
@@ -673,10 +732,17 @@ version = "0.1.0"
|
|
|
673
732
|
dependencies = [
|
|
674
733
|
"magnus",
|
|
675
734
|
"rb-sys",
|
|
735
|
+
"sha2",
|
|
676
736
|
"text-splitter",
|
|
677
737
|
"tiktoken-rs",
|
|
678
738
|
]
|
|
679
739
|
|
|
740
|
+
[[package]]
|
|
741
|
+
name = "typenum"
|
|
742
|
+
version = "1.19.0"
|
|
743
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
744
|
+
checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
|
|
745
|
+
|
|
680
746
|
[[package]]
|
|
681
747
|
name = "unicase"
|
|
682
748
|
version = "2.9.0"
|
data/README.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# tomos
|
|
2
|
+
|
|
3
|
+
Token-aware text chunking for RAG pipelines, powered by Rust.
|
|
4
|
+
|
|
5
|
+
Tomos wraps the [`text-splitter`](https://github.com/benbrandt/text-splitter) Rust crate with [tiktoken](https://github.com/zurawiki/tiktoken-rs) tokenization and exposes two splitter classes to Ruby: `Tomos::Text` for plain text and `Tomos::Markdown` for Markdown documents. Each chunk carries its token count, byte position, and a SHA-256 content fingerprint.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```ruby
|
|
10
|
+
gem "tomos"
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Because tomos includes a native Rust extension, you'll need a Rust toolchain installed. The gem compiles on `bundle install`.
|
|
14
|
+
|
|
15
|
+
## Usage
|
|
16
|
+
|
|
17
|
+
### Splitting text into chunks
|
|
18
|
+
|
|
19
|
+
```ruby
|
|
20
|
+
splitter = Tomos::Text.new(model: "gpt-4", capacity: 512)
|
|
21
|
+
chunks = splitter.chunks("A long document goes here...")
|
|
22
|
+
|
|
23
|
+
chunks.each do |chunk|
|
|
24
|
+
chunk.text # => String — the chunk content
|
|
25
|
+
chunk.token_count # => Integer — tokens in this chunk
|
|
26
|
+
chunk.byte_offset # => Integer — start position in the original string
|
|
27
|
+
chunk.byte_length # => Integer — byte length of the chunk
|
|
28
|
+
chunk.chunk_id # => String — 64-char SHA-256 hex digest
|
|
29
|
+
end
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
The `capacity` is the maximum number of tokens per chunk. An optional `overlap` keyword shares tokens between adjacent chunks, which helps preserve context at boundaries:
|
|
33
|
+
|
|
34
|
+
```ruby
|
|
35
|
+
splitter = Tomos::Text.new(model: "gpt-4", capacity: 512, overlap: 50)
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### Splitting Markdown
|
|
39
|
+
|
|
40
|
+
`Tomos::Markdown` is Markdown-structure-aware — it respects headers, lists, and code fences when deciding where to split:
|
|
41
|
+
|
|
42
|
+
```ruby
|
|
43
|
+
splitter = Tomos::Markdown.new(model: "gpt-4", capacity: 512)
|
|
44
|
+
chunks = splitter.chunks(File.read("document.md"))
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Note: tokenization is over the raw input string regardless of splitter type; `Markdown` differs only in where it chooses split boundaries.
|
|
48
|
+
|
|
49
|
+
### Counting tokens
|
|
50
|
+
|
|
51
|
+
Count tokens directly without constructing a splitter:
|
|
52
|
+
|
|
53
|
+
```ruby
|
|
54
|
+
# Class method — resolves the tokenizer fresh each call
|
|
55
|
+
Tomos::Text.count_tokens("Hello, world!", model: "gpt-4")
|
|
56
|
+
# => 4
|
|
57
|
+
|
|
58
|
+
Tomos::Markdown.count_tokens("# Hello\n\nWorld", model: "gpt-4")
|
|
59
|
+
# => 4
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
If you already have a splitter instance, the instance method reuses its already-resolved tokenizer:
|
|
63
|
+
|
|
64
|
+
```ruby
|
|
65
|
+
splitter = Tomos::Text.new(model: "gpt-4", capacity: 512)
|
|
66
|
+
splitter.count_tokens("Hello, world!")
|
|
67
|
+
# => 4
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Both forms return `0` for empty input and raise `ArgumentError` for unrecognized model names.
|
|
71
|
+
|
|
72
|
+
### Supported models
|
|
73
|
+
|
|
74
|
+
Any model name recognized by tiktoken, including:
|
|
75
|
+
|
|
76
|
+
- `gpt-4`, `gpt-4o`, `gpt-4.1`, `gpt-5`
|
|
77
|
+
- `o1`, `o3`, `o4`, and versioned variants of these and of the GPT models above (e.g. `o1-mini`, `gpt-4o-2024-05-13`)
|
|
78
|
+
- `gpt-3.5-turbo`
|
|
79
|
+
- `text-embedding-ada-002`, `text-embedding-3-small`, `text-embedding-3-large`
|
|
80
|
+
|
|
81
|
+
Unrecognized model names raise `ArgumentError`.
|
|
82
|
+
|
|
83
|
+
## Chunk metadata
|
|
84
|
+
|
|
85
|
+
Each `Tomos::Chunk` exposes:
|
|
86
|
+
|
|
87
|
+
| Method | Type | Description |
|
|
88
|
+
|---|---|---|
|
|
89
|
+
| `text` | `String` | The chunk content |
|
|
90
|
+
| `token_count` | `Integer` | Number of tokens in this chunk |
|
|
91
|
+
| `byte_offset` | `Integer` | Start byte position in the source string |
|
|
92
|
+
| `byte_length` | `Integer` | Byte length of the chunk |
|
|
93
|
+
| `chunk_id` | `String` | 64-char lowercase SHA-256 hex digest of the chunk text |
|
|
94
|
+
|
|
95
|
+
The byte metadata lets you map a chunk back to its exact position in the source:
|
|
96
|
+
|
|
97
|
+
```ruby
|
|
98
|
+
source.byteslice(chunk.byte_offset, chunk.byte_length) == chunk.text # => true
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
The `chunk_id` is deterministic — the same text always produces the same ID, regardless of model, capacity, or overlap.
|
|
102
|
+
|
|
103
|
+
## License
|
|
104
|
+
|
|
105
|
+
MIT
|
data/ext/tomos/Cargo.toml
CHANGED
data/ext/tomos/src/lib.rs
CHANGED
|
@@ -1,21 +1,36 @@
|
|
|
1
|
+
//! Tomos – Ruby-native text chunking backed by [`text_splitter`] and [`tiktoken_rs`].
|
|
2
|
+
//!
|
|
3
|
+
//! Exposes `Tomos::Text` and `Tomos::Markdown` to Ruby via Magnus. Each wraps a
|
|
4
|
+
//! token-aware splitter that produces `Tomos::Chunk` objects carrying metadata
|
|
5
|
+
//! (token count, byte offset/length, SHA-256 chunk ID).
|
|
6
|
+
|
|
1
7
|
use magnus::{function, method, prelude::*, Error, Ruby};
|
|
8
|
+
use sha2::{Digest, Sha256};
|
|
2
9
|
use text_splitter::{ChunkConfig, MarkdownSplitter, TextSplitter};
|
|
3
10
|
use tiktoken_rs::get_bpe_from_model;
|
|
4
11
|
|
|
5
12
|
type RbResult<T> = Result<T, Error>;
|
|
6
13
|
|
|
14
|
+
/// Resolve a tiktoken BPE tokenizer by model name, mapping unknown models to
|
|
15
|
+
/// a Ruby `ArgumentError`.
|
|
16
|
+
fn resolve_bpe(ruby: &Ruby, model: &str) -> RbResult<tiktoken_rs::CoreBPE> {
|
|
17
|
+
get_bpe_from_model(model).map_err(|e| {
|
|
18
|
+
Error::new(
|
|
19
|
+
ruby.exception_arg_error(),
|
|
20
|
+
format!("unrecognized tiktoken model '{model}': {e}"),
|
|
21
|
+
)
|
|
22
|
+
})
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
/// Build a [`ChunkConfig`] with the given token `capacity` and `overlap`,
|
|
26
|
+
/// sized by the BPE tokenizer for `model`.
|
|
7
27
|
fn build_chunk_config(
|
|
8
28
|
ruby: &Ruby,
|
|
9
29
|
model: &str,
|
|
10
30
|
capacity: usize,
|
|
11
31
|
overlap: usize,
|
|
12
32
|
) -> RbResult<ChunkConfig<tiktoken_rs::CoreBPE>> {
|
|
13
|
-
let bpe =
|
|
14
|
-
Error::new(
|
|
15
|
-
ruby.exception_arg_error(),
|
|
16
|
-
format!("unrecognized tiktoken model '{model}': {e}"),
|
|
17
|
-
)
|
|
18
|
-
})?;
|
|
33
|
+
let bpe = resolve_bpe(ruby, model)?;
|
|
19
34
|
|
|
20
35
|
let config = ChunkConfig::new(capacity)
|
|
21
36
|
.with_overlap(overlap)
|
|
@@ -30,60 +45,147 @@ fn build_chunk_config(
|
|
|
30
45
|
Ok(config)
|
|
31
46
|
}
|
|
32
47
|
|
|
33
|
-
///
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
splitter: TextSplitter<tiktoken_rs::CoreBPE>,
|
|
48
|
+
/// Produce a hex-encoded SHA-256 digest of `text` for use as a chunk identifier.
|
|
49
|
+
fn chunk_id(text: &str) -> String {
|
|
50
|
+
let mut hasher = Sha256::new();
|
|
51
|
+
hasher.update(text.as_bytes());
|
|
52
|
+
format!("{:x}", hasher.finalize())
|
|
39
53
|
}
|
|
40
54
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
55
|
+
/// Collect splitter output into a Ruby array of [`RbChunk`] objects.
|
|
56
|
+
fn collect_chunks<'a>(
|
|
57
|
+
ruby: &Ruby,
|
|
58
|
+
iter: impl Iterator<Item = (usize, &'a str)>,
|
|
59
|
+
bpe: &tiktoken_rs::CoreBPE,
|
|
60
|
+
) -> magnus::RArray {
|
|
61
|
+
let ary = ruby.ary_new();
|
|
62
|
+
for (offset, slice) in iter {
|
|
63
|
+
let _ = ary.push(RbChunk {
|
|
64
|
+
text: slice.to_owned(),
|
|
65
|
+
token_count: bpe.encode_ordinary(slice).len(),
|
|
66
|
+
byte_offset: offset,
|
|
67
|
+
byte_length: slice.len(),
|
|
68
|
+
chunk_id: chunk_id(slice),
|
|
69
|
+
});
|
|
47
70
|
}
|
|
71
|
+
ary
|
|
72
|
+
}
|
|
48
73
|
|
|
49
|
-
|
|
50
|
-
|
|
74
|
+
/// Count tokens in `text` using the given BPE tokenizer.
|
|
75
|
+
fn count_tokens_with_bpe(bpe: &tiktoken_rs::CoreBPE, text: &str) -> usize {
|
|
76
|
+
if text.is_empty() {
|
|
77
|
+
return 0;
|
|
51
78
|
}
|
|
79
|
+
bpe.encode_ordinary(text).len()
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
/// Standalone token-counting entry point exposed as a class-level method on
|
|
83
|
+
/// both `Tomos::Text` and `Tomos::Markdown`.
|
|
84
|
+
fn count_tokens(ruby: &Ruby, text: String, model: String) -> RbResult<usize> {
|
|
85
|
+
let bpe = resolve_bpe(ruby, &model)?;
|
|
86
|
+
Ok(count_tokens_with_bpe(&bpe, &text))
|
|
52
87
|
}
|
|
53
88
|
|
|
54
|
-
///
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
89
|
+
/// A single chunk produced by a splitter, exposed to Ruby as `Tomos::Chunk`.
|
|
90
|
+
#[derive(Debug)]
|
|
91
|
+
#[magnus::wrap(class = "Tomos::Chunk", free_immediately, size)]
|
|
92
|
+
struct RbChunk {
|
|
93
|
+
text: String,
|
|
94
|
+
token_count: usize,
|
|
95
|
+
byte_offset: usize,
|
|
96
|
+
byte_length: usize,
|
|
97
|
+
chunk_id: String,
|
|
61
98
|
}
|
|
62
99
|
|
|
63
|
-
impl
|
|
64
|
-
fn
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
100
|
+
impl RbChunk {
|
|
101
|
+
fn text(&self) -> &str {
|
|
102
|
+
&self.text
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
fn token_count(&self) -> usize {
|
|
106
|
+
self.token_count
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
fn byte_offset(&self) -> usize {
|
|
110
|
+
self.byte_offset
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
fn byte_length(&self) -> usize {
|
|
114
|
+
self.byte_length
|
|
69
115
|
}
|
|
70
116
|
|
|
71
|
-
fn
|
|
72
|
-
self.
|
|
117
|
+
fn chunk_id(&self) -> &str {
|
|
118
|
+
&self.chunk_id
|
|
73
119
|
}
|
|
74
120
|
}
|
|
75
121
|
|
|
122
|
+
/// Generates a splitter wrapper struct with `new`, `chunks`, and `count_tokens`
|
|
123
|
+
/// methods. Both `RbText` and `RbMarkdown` share identical structure and logic,
|
|
124
|
+
/// differing only in the underlying splitter type.
|
|
125
|
+
macro_rules! define_splitter {
|
|
126
|
+
($name:ident, $class:literal, $splitter:ty) => {
|
|
127
|
+
#[magnus::wrap(class = $class, free_immediately, size)]
|
|
128
|
+
struct $name {
|
|
129
|
+
splitter: $splitter,
|
|
130
|
+
bpe: tiktoken_rs::CoreBPE,
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
impl $name {
|
|
134
|
+
fn new(
|
|
135
|
+
ruby: &Ruby,
|
|
136
|
+
model: String,
|
|
137
|
+
capacity: usize,
|
|
138
|
+
overlap: usize,
|
|
139
|
+
) -> RbResult<Self> {
|
|
140
|
+
let config = build_chunk_config(ruby, &model, capacity, overlap)?;
|
|
141
|
+
let bpe = resolve_bpe(ruby, &model)?;
|
|
142
|
+
Ok(Self {
|
|
143
|
+
splitter: <$splitter>::new(config),
|
|
144
|
+
bpe,
|
|
145
|
+
})
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
fn chunks(&self, text: String) -> magnus::RArray {
|
|
149
|
+
let ruby = Ruby::get().expect("chunks called outside Ruby thread");
|
|
150
|
+
collect_chunks(&ruby, self.splitter.chunk_indices(&text), &self.bpe)
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
fn count_tokens(&self, text: String) -> usize {
|
|
154
|
+
count_tokens_with_bpe(&self.bpe, &text)
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
};
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
define_splitter!(RbText, "Tomos::Text", TextSplitter<tiktoken_rs::CoreBPE>);
|
|
161
|
+
define_splitter!(
|
|
162
|
+
RbMarkdown,
|
|
163
|
+
"Tomos::Markdown",
|
|
164
|
+
MarkdownSplitter<tiktoken_rs::CoreBPE>
|
|
165
|
+
);
|
|
166
|
+
|
|
76
167
|
#[magnus::init]
|
|
77
168
|
fn init(ruby: &Ruby) -> RbResult<()> {
|
|
78
169
|
let module = ruby.define_module("Tomos")?;
|
|
79
170
|
|
|
171
|
+
let chunk = module.define_class("Chunk", ruby.class_object())?;
|
|
172
|
+
chunk.define_method("text", method!(RbChunk::text, 0))?;
|
|
173
|
+
chunk.define_method("token_count", method!(RbChunk::token_count, 0))?;
|
|
174
|
+
chunk.define_method("byte_offset", method!(RbChunk::byte_offset, 0))?;
|
|
175
|
+
chunk.define_method("byte_length", method!(RbChunk::byte_length, 0))?;
|
|
176
|
+
chunk.define_method("chunk_id", method!(RbChunk::chunk_id, 0))?;
|
|
177
|
+
|
|
80
178
|
let text = module.define_class("Text", ruby.class_object())?;
|
|
81
|
-
text.define_singleton_method("
|
|
179
|
+
text.define_singleton_method("_new", function!(RbText::new, 3))?;
|
|
82
180
|
text.define_method("chunks", method!(RbText::chunks, 1))?;
|
|
181
|
+
text.define_method("count_tokens", method!(RbText::count_tokens, 1))?;
|
|
182
|
+
text.define_singleton_method("_count_tokens", function!(count_tokens, 2))?;
|
|
83
183
|
|
|
84
184
|
let markdown = module.define_class("Markdown", ruby.class_object())?;
|
|
85
|
-
markdown.define_singleton_method("
|
|
185
|
+
markdown.define_singleton_method("_new", function!(RbMarkdown::new, 3))?;
|
|
86
186
|
markdown.define_method("chunks", method!(RbMarkdown::chunks, 1))?;
|
|
187
|
+
markdown.define_method("count_tokens", method!(RbMarkdown::count_tokens, 1))?;
|
|
188
|
+
markdown.define_singleton_method("_count_tokens", function!(count_tokens, 2))?;
|
|
87
189
|
|
|
88
190
|
Ok(())
|
|
89
191
|
}
|
data/lib/tomos/tomos.bundle
CHANGED
|
Binary file
|
data/lib/tomos/version.rb
CHANGED
data/lib/tomos.rb
CHANGED
|
@@ -2,3 +2,25 @@
|
|
|
2
2
|
|
|
3
3
|
require "tomos/version"
|
|
4
4
|
require "tomos/tomos"
|
|
5
|
+
|
|
6
|
+
module Tomos
|
|
7
|
+
class Text
|
|
8
|
+
def self.new(model:, capacity:, overlap: 0)
|
|
9
|
+
_new(model, capacity, overlap)
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def self.count_tokens(text, model:)
|
|
13
|
+
_count_tokens(text, model)
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
class Markdown
|
|
18
|
+
def self.new(model:, capacity:, overlap: 0)
|
|
19
|
+
_new(model, capacity, overlap)
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def self.count_tokens(text, model:)
|
|
23
|
+
_count_tokens(text, model)
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tomos
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.4.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Dan Frenette
|
|
@@ -30,8 +30,10 @@ extensions:
|
|
|
30
30
|
- ext/tomos/extconf.rb
|
|
31
31
|
extra_rdoc_files: []
|
|
32
32
|
files:
|
|
33
|
+
- CHANGELOG.md
|
|
33
34
|
- Cargo.lock
|
|
34
35
|
- Cargo.toml
|
|
36
|
+
- README.md
|
|
35
37
|
- ext/tomos/Cargo.toml
|
|
36
38
|
- ext/tomos/extconf.rb
|
|
37
39
|
- ext/tomos/src/lib.rs
|