tokenizer-ruby 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CLAUDE.md +186 -0
- data/LICENSE +21 -0
- data/ext/tokenizer_ruby/Cargo.toml +12 -0
- data/ext/tokenizer_ruby/extconf.rb +6 -0
- data/ext/tokenizer_ruby/src/lib.rs +125 -0
- data/lib/tokenizer_ruby/encoding.rb +18 -0
- data/lib/tokenizer_ruby/tokenizer.rb +81 -0
- data/lib/tokenizer_ruby/version.rb +5 -0
- data/lib/tokenizer_ruby.rb +16 -0
- metadata +65 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: '08ba527f839b738f57491cffd8490e9ff6eef4c34d0be6754ba987f4621d84b6'
|
|
4
|
+
data.tar.gz: 91bef2ba51e51178bd045397c4374669842e90b309d2a65d423cbc6267f2b7b7
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: dc5e34e66506f9f798526e36f923f75a06f27ac75e16c194bd25f5c2421270dcb2fa511b36a3e4dacd835f9036c76fd5959ace8de1291f15ebe624f9093c30e3
|
|
7
|
+
data.tar.gz: '083fb7459ff8cc61d5b50fcc085bc3dea7703b84d593004a8659d2db5214308a95a6ca41d685f78baab798ec07ef8e045e587e98340319c32757b1994afe565c'
|
data/CLAUDE.md
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# tokenizer-ruby
|
|
2
|
+
|
|
3
|
+
## Project Overview
|
|
4
|
+
|
|
5
|
+
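Ruby bindings for HuggingFace's [tokenizers](https://github.com/huggingface/tokenizers) library. The tokenizers library is written in **Rust**, so Rice (a C++ binding tool) does not apply. The extension shipped in `ext/tokenizer_ruby` binds to the `tokenizers` crate directly with **magnus**/**rb-sys** (the preferred option under "Technical Approach" below), rather than going through **FFI** and a C API such as `tokenizers-c`.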
|
|
6
|
+
|
|
7
|
+
The gem should allow Ruby developers to tokenize text using any HuggingFace tokenizer (GPT, BERT, LLaMA, Claude, etc.) with fast Rust-powered performance.
|
|
8
|
+
|
|
9
|
+
## Author
|
|
10
|
+
|
|
11
|
+
- Name: Johannes Dwi Cahyo
|
|
12
|
+
- GitHub: johannesdwicahyo
|
|
13
|
+
- Repo: git@github.com:johannesdwicahyo/tokenizer-ruby.git
|
|
14
|
+
|
|
15
|
+
## Technical Approach
|
|
16
|
+
|
|
17
|
+
### Binding Strategy: FFI via `ffi` gem
|
|
18
|
+
|
|
19
|
+
Unlike zvec-ruby (which used Rice for C++ bindings), this gem wraps a **Rust** library. The recommended approach:
|
|
20
|
+
|
|
21
|
+
1. Use the [`tokenizers` Rust crate](https://github.com/huggingface/tokenizers) which provides C bindings
|
|
22
|
+
2. Build a small Rust cdylib that exposes a C API for the functions we need
|
|
23
|
+
3. Use Ruby's `ffi` gem to call into the compiled `.dylib`/`.so`
|
|
24
|
+
4. Ship precompiled binaries for common platforms (like zvec-ruby does)
|
|
25
|
+
|
|
26
|
+
Alternative: use [rb-sys](https://github.com/oxidize-rb/rb-sys) + [magnus](https://github.com/matsadler/magnus) for direct Rust→Ruby bindings (more idiomatic but more complex). **Prefer this approach** if feasible — it's the modern Rust-Ruby binding standard.
|
|
27
|
+
|
|
28
|
+
### If using magnus/rb-sys:
|
|
29
|
+
```toml
|
|
30
|
+
# Cargo.toml
|
|
31
|
+
[dependencies]
|
|
32
|
+
magnus = "0.7"
|
|
33
|
+
tokenizers = "0.21"
|
|
34
|
+
|
|
35
|
+
[lib]
|
|
36
|
+
crate-type = ["cdylib"]
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### If using FFI:
|
|
40
|
+
Build a thin C wrapper around the Rust tokenizers crate, then use `ffi` gem.
|
|
41
|
+
|
|
42
|
+
## Core API Design
|
|
43
|
+
|
|
44
|
+
```ruby
|
|
45
|
+
require "tokenizer_ruby"
|
|
46
|
+
|
|
47
|
+
# Load from HuggingFace hub (downloads and caches)
|
|
48
|
+
tokenizer = TokenizerRuby::Tokenizer.from_pretrained("gpt2")
|
|
49
|
+
tokenizer = TokenizerRuby::Tokenizer.from_pretrained("bert-base-uncased")
|
|
50
|
+
tokenizer = TokenizerRuby::Tokenizer.from_pretrained("meta-llama/Llama-3-8B")
|
|
51
|
+
|
|
52
|
+
# Load from local file
|
|
53
|
+
tokenizer = TokenizerRuby::Tokenizer.from_file("/path/to/tokenizer.json")
|
|
54
|
+
|
|
55
|
+
# Basic encoding
|
|
56
|
+
encoding = tokenizer.encode("Hello, world!")
|
|
57
|
+
encoding.ids # => [15496, 11, 995, 0]
|
|
58
|
+
encoding.tokens # => ["Hello", ",", " world", "!"]
|
|
59
|
+
encoding.offsets # => [[0,5], [5,6], [6,12], [12,13]]
|
|
60
|
+
encoding.length # => 4
|
|
61
|
+
|
|
62
|
+
# Decode
|
|
63
|
+
tokenizer.decode([15496, 11, 995, 0]) # => "Hello, world!"
|
|
64
|
+
|
|
65
|
+
# Batch encoding
|
|
66
|
+
encodings = tokenizer.encode_batch(["Hello", "World"])
|
|
67
|
+
|
|
68
|
+
# Token counting (most common use case)
|
|
69
|
+
tokenizer.count("Some text here") # => 3
|
|
70
|
+
|
|
71
|
+
# Truncation and padding
|
|
72
|
+
tokenizer.enable_truncation(max_length: 512)
|
|
73
|
+
tokenizer.enable_padding(length: 512, pad_token: "[PAD]")
|
|
74
|
+
|
|
75
|
+
# Vocab
|
|
76
|
+
tokenizer.vocab_size # => 50257
|
|
77
|
+
tokenizer.token_to_id("hello") # => 31373
|
|
78
|
+
tokenizer.id_to_token(31373) # => "hello"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Features to Implement
|
|
82
|
+
|
|
83
|
+
### Phase 1 — Core (MVP)
|
|
84
|
+
- [ ] `from_pretrained(model_name)` — download from HuggingFace hub
|
|
85
|
+
- [ ] `from_file(path)` — load from local tokenizer.json
|
|
86
|
+
- [ ] `encode(text)` → Encoding (ids, tokens, offsets, attention_mask)
|
|
87
|
+
- [ ] `decode(ids)` → String
|
|
88
|
+
- [ ] `encode_batch(texts)` → Array of Encodings
|
|
89
|
+
- [ ] `decode_batch(ids_array)` → Array of Strings
|
|
90
|
+
- [ ] `vocab_size`
|
|
91
|
+
- [ ] `token_to_id(token)` / `id_to_token(id)`
|
|
92
|
+
|
|
93
|
+
### Phase 2 — Convenience
|
|
94
|
+
- [ ] `count(text)` — token count helper
|
|
95
|
+
- [ ] `truncate(text, max_tokens:)` — truncate to token limit
|
|
96
|
+
- [ ] `enable_truncation(max_length:)`
|
|
97
|
+
- [ ] `enable_padding(length:, pad_token:)`
|
|
98
|
+
- [ ] Thread safety (Rust tokenizer is Send+Sync)
|
|
99
|
+
|
|
100
|
+
### Phase 3 — Rails Integration
|
|
101
|
+
- [ ] `TokenizerRuby.default_tokenizer = "gpt2"` global config
|
|
102
|
+
- [ ] ActiveModel validator: `validates :content, token_length: { maximum: 4096 }`
|
|
103
|
+
- [ ] Caching of downloaded tokenizer files
|
|
104
|
+
|
|
105
|
+
## Project Structure
|
|
106
|
+
|
|
107
|
+
```
|
|
108
|
+
tokenizer-ruby/
|
|
109
|
+
├── CLAUDE.md
|
|
110
|
+
├── Gemfile
|
|
111
|
+
├── Rakefile
|
|
112
|
+
├── LICENSE # MIT
|
|
113
|
+
├── README.md
|
|
114
|
+
├── tokenizer-ruby.gemspec
|
|
115
|
+
├── lib/
|
|
116
|
+
│ ├── tokenizer_ruby.rb
|
|
117
|
+
│ ├── tokenizer_ruby/
|
|
118
|
+
│ │ ├── version.rb
|
|
119
|
+
│ │ ├── tokenizer.rb
|
|
120
|
+
│ │ └── encoding.rb
|
|
121
|
+
├── ext/
|
|
122
|
+
│ └── tokenizer_ruby/
|
|
123
|
+
│ ├── Cargo.toml
|
|
124
|
+
│ ├── extconf.rb
|
|
125
|
+
│ └── src/
|
|
126
|
+
│ └── lib.rs
|
|
127
|
+
├── test/
|
|
128
|
+
│ ├── test_helper.rb
|
|
129
|
+
│ ├── test_tokenizer.rb
|
|
130
|
+
│ └── test_encoding.rb
|
|
131
|
+
└── script/
|
|
132
|
+
└── package_native_gem.rb
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Dependencies
|
|
136
|
+
|
|
137
|
+
### Runtime
|
|
138
|
+
- `rb_sys` (if using magnus) OR `ffi` (if using FFI approach)
|
|
139
|
+
|
|
140
|
+
### Build
|
|
141
|
+
- Rust toolchain (cargo, rustc)
|
|
142
|
+
- `rake-compiler` for building native extensions
|
|
143
|
+
|
|
144
|
+
### Development
|
|
145
|
+
- `minitest` for testing
|
|
146
|
+
- `rake` for tasks
|
|
147
|
+
|
|
148
|
+
## Precompiled Gems
|
|
149
|
+
|
|
150
|
+
Follow the same pattern as zvec-ruby:
|
|
151
|
+
- Build for x86_64-linux, aarch64-linux, x86_64-darwin, arm64-darwin
|
|
152
|
+
- Ruby versions 3.1, 3.2, 3.3, 3.4
|
|
153
|
+
- GitHub Actions workflow for automated builds on tag push
|
|
154
|
+
- Users just `gem install tokenizer-ruby` with zero build dependencies
|
|
155
|
+
|
|
156
|
+
## Key References
|
|
157
|
+
|
|
158
|
+
- HuggingFace tokenizers repo: https://github.com/huggingface/tokenizers
|
|
159
|
+
- tokenizers Rust crate docs: https://docs.rs/tokenizers
|
|
160
|
+
- magnus (Rust-Ruby bindings): https://github.com/matsadler/magnus
|
|
161
|
+
- rb-sys: https://github.com/oxidize-rb/rb-sys
|
|
162
|
+
- Existing tokenizer libraries (for API inspiration, not wrapping):
|
|
163
|
+
- tiktoken_ruby (OpenAI only)
|
|
164
|
+
- tokenizers (Python, the reference API)
|
|
165
|
+
|
|
166
|
+
## Testing Strategy
|
|
167
|
+
|
|
168
|
+
- Unit tests with known tokenizer outputs (use `gpt2` as reference tokenizer)
|
|
169
|
+
- Test with multiple model tokenizers (GPT-2, BERT, LLaMA)
|
|
170
|
+
- Test batch encoding/decoding
|
|
171
|
+
- Test special tokens handling
|
|
172
|
+
- Test unicode and multilingual text
|
|
173
|
+
- Test thread safety with concurrent encoding
|
|
174
|
+
|
|
175
|
+
## Publishing
|
|
176
|
+
|
|
177
|
+
- RubyGems.org: `gem push tokenizer-ruby-*.gem`
|
|
178
|
+
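- gem.coop: `GEM_HOST_API_KEY=<your-gem.coop-api-key> gem push tokenizer-ruby-*.gem --host https://beta.gem.coop/@johannesdwicahyo` (never commit or package the real key; a key that has been published inside a released gem must be revoked and rotated)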
|
|
179
|
+
|
|
180
|
+
## Notes from zvec-ruby Experience
|
|
181
|
+
|
|
182
|
+
- Precompiled native gems are essential for adoption — nobody wants to install build tools
|
|
183
|
+
- Use `script/package_native_gem.rb` pattern for macOS builds
|
|
184
|
+
- Use `rake-compiler-dock` for Linux cross-compilation
|
|
185
|
+
- Test on multiple Ruby versions (3.1–3.4)
|
|
186
|
+
- Keep the Ruby API clean and idiomatic — hide C/Rust complexity behind a nice DSL
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Johannes Dwi Cahyo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
use magnus::{
|
|
2
|
+
define_module, function, method, prelude::*, Error, RHash, Ruby,
|
|
3
|
+
RArray,
|
|
4
|
+
};
|
|
5
|
+
use std::cell::RefCell;
|
|
6
|
+
use tokenizers::Tokenizer;
|
|
7
|
+
|
|
8
|
+
#[magnus::wrap(class = "TokenizerRuby::InternalTokenizer", free_immediately)]
|
|
9
|
+
struct RubyTokenizer(RefCell<Tokenizer>);
|
|
10
|
+
|
|
11
|
+
fn from_pretrained(ruby: &Ruby, identifier: String) -> Result<RubyTokenizer, Error> {
|
|
12
|
+
let tokenizer = Tokenizer::from_pretrained(&identifier, None)
|
|
13
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("{}", e)))?;
|
|
14
|
+
Ok(RubyTokenizer(RefCell::new(tokenizer)))
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
fn from_file(ruby: &Ruby, path: String) -> Result<RubyTokenizer, Error> {
|
|
18
|
+
let tokenizer = Tokenizer::from_file(&path)
|
|
19
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("{}", e)))?;
|
|
20
|
+
Ok(RubyTokenizer(RefCell::new(tokenizer)))
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
/// Converts a `tokenizers::Encoding` into a Ruby Hash with the symbol keys
/// `:ids`, `:tokens`, `:offsets` and `:attention_mask`.
///
/// * `:ids` and `:attention_mask` are Arrays of Integers.
/// * `:tokens` is an Array of Strings.
/// * `:offsets` is an Array of two-element `[start, end]` Arrays, passed
///   through exactly as the encoding reports them.
///
/// Ruby objects are created through the `ruby` handle the caller already
/// holds rather than through the free-standing constructors.
///
/// NOTE(review): the upstream docs describe the offsets produced by
/// `Tokenizer::encode` as byte offsets; Ruby indexes strings by character, so
/// confirm whether callers expect byte or character positions.
fn encoding_to_hash(ruby: &Ruby, encoding: &tokenizers::Encoding) -> Result<RHash, Error> {
    let hash = ruby.hash_new();

    // u32 -> i64 is lossless, so `i64::from` is used instead of an `as` cast.
    let ids: Vec<i64> = encoding.get_ids().iter().copied().map(i64::from).collect();
    hash.aset(ruby.sym_new("ids"), ruby.ary_from_vec(ids))?;

    let tokens: Vec<String> = encoding.get_tokens().to_vec();
    hash.aset(ruby.sym_new("tokens"), ruby.ary_from_vec(tokens))?;

    // One [start, end] pair per token; the outer array is sized up front.
    let spans = encoding.get_offsets();
    let offsets = ruby.ary_new_capa(spans.len());
    for &(start, stop) in spans {
        offsets.push(ruby.ary_from_vec(vec![start as i64, stop as i64]))?;
    }
    hash.aset(ruby.sym_new("offsets"), offsets)?;

    let mask: Vec<i64> = encoding
        .get_attention_mask()
        .iter()
        .copied()
        .map(i64::from)
        .collect();
    hash.aset(ruby.sym_new("attention_mask"), ruby.ary_from_vec(mask))?;

    Ok(hash)
}
|
|
43
|
+
|
|
44
|
+
impl RubyTokenizer {
|
|
45
|
+
fn encode(ruby: &Ruby, rb_self: &Self, text: String) -> Result<RHash, Error> {
|
|
46
|
+
let encoding = rb_self.0.borrow()
|
|
47
|
+
.encode(text.as_str(), false)
|
|
48
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("{}", e)))?;
|
|
49
|
+
encoding_to_hash(ruby, &encoding)
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
fn decode(ruby: &Ruby, rb_self: &Self, ids: Vec<u32>) -> Result<String, Error> {
|
|
53
|
+
rb_self.0.borrow()
|
|
54
|
+
.decode(&ids, true)
|
|
55
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("{}", e)))
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
fn encode_batch(ruby: &Ruby, rb_self: &Self, texts: Vec<String>) -> Result<RArray, Error> {
|
|
59
|
+
let inputs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
|
|
60
|
+
let encodings = rb_self.0.borrow()
|
|
61
|
+
.encode_batch(inputs, false)
|
|
62
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("{}", e)))?;
|
|
63
|
+
let result = RArray::new();
|
|
64
|
+
for encoding in &encodings {
|
|
65
|
+
result.push(encoding_to_hash(ruby, encoding)?)?;
|
|
66
|
+
}
|
|
67
|
+
Ok(result)
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
fn decode_batch(ruby: &Ruby, rb_self: &Self, ids_array: Vec<Vec<u32>>) -> Result<Vec<String>, Error> {
|
|
71
|
+
let refs: Vec<&[u32]> = ids_array.iter().map(|v| v.as_slice()).collect();
|
|
72
|
+
rb_self.0.borrow()
|
|
73
|
+
.decode_batch(&refs, true)
|
|
74
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("{}", e)))
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
fn vocab_size(&self) -> usize {
|
|
78
|
+
self.0.borrow().get_vocab_size(true)
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
fn token_to_id(&self, token: String) -> Option<u32> {
|
|
82
|
+
self.0.borrow().token_to_id(&token)
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
fn id_to_token(&self, id: u32) -> Option<String> {
|
|
86
|
+
self.0.borrow().id_to_token(id)
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
fn enable_truncation(&self, max_length: usize) {
|
|
90
|
+
let params = tokenizers::TruncationParams {
|
|
91
|
+
max_length,
|
|
92
|
+
..Default::default()
|
|
93
|
+
};
|
|
94
|
+
let _ = self.0.borrow_mut().with_truncation(Some(params));
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
fn enable_padding(&self, length: usize, pad_token: String) {
|
|
98
|
+
let params = tokenizers::PaddingParams {
|
|
99
|
+
strategy: tokenizers::PaddingStrategy::Fixed(length),
|
|
100
|
+
pad_token,
|
|
101
|
+
..Default::default()
|
|
102
|
+
};
|
|
103
|
+
self.0.borrow_mut().with_padding(Some(params));
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
#[magnus::init]
|
|
108
|
+
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
109
|
+
let module = define_module("TokenizerRuby")?;
|
|
110
|
+
|
|
111
|
+
let class = module.define_class("InternalTokenizer", ruby.class_object())?;
|
|
112
|
+
class.define_singleton_method("from_pretrained", function!(from_pretrained, 1))?;
|
|
113
|
+
class.define_singleton_method("from_file", function!(from_file, 1))?;
|
|
114
|
+
class.define_method("_encode", method!(RubyTokenizer::encode, 1))?;
|
|
115
|
+
class.define_method("_decode", method!(RubyTokenizer::decode, 1))?;
|
|
116
|
+
class.define_method("_encode_batch", method!(RubyTokenizer::encode_batch, 1))?;
|
|
117
|
+
class.define_method("_decode_batch", method!(RubyTokenizer::decode_batch, 1))?;
|
|
118
|
+
class.define_method("vocab_size", method!(RubyTokenizer::vocab_size, 0))?;
|
|
119
|
+
class.define_method("token_to_id", method!(RubyTokenizer::token_to_id, 1))?;
|
|
120
|
+
class.define_method("id_to_token", method!(RubyTokenizer::id_to_token, 1))?;
|
|
121
|
+
class.define_method("_enable_truncation", method!(RubyTokenizer::enable_truncation, 1))?;
|
|
122
|
+
class.define_method("_enable_padding", method!(RubyTokenizer::enable_padding, 2))?;
|
|
123
|
+
|
|
124
|
+
Ok(())
|
|
125
|
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module TokenizerRuby
  # Result of tokenizing one piece of text: four parallel collections that
  # describe the same sequence of tokens.
  class Encoding
    # @return [Array<Integer>] token ids
    # @return [Array<String>] token strings
    # @return [Array<Array(Integer, Integer)>] one [start, end] pair per token
    # @return [Array<Integer>] attention mask, one entry per token
    attr_reader :ids, :tokens, :offsets, :attention_mask

    # @param ids [Array<Integer>] token ids
    # @param tokens [Array<String>] token strings
    # @param offsets [Array<Array(Integer, Integer)>] [start, end] per token
    # @param attention_mask [Array<Integer>] attention mask values
    def initialize(ids:, tokens:, offsets:, attention_mask:)
      @ids = ids
      @tokens = tokens
      @offsets = offsets
      @attention_mask = attention_mask
    end

    # Number of tokens, taken from the number of ids.
    #
    # @return [Integer]
    def length
      @ids.length
    end
  end
end
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module TokenizerRuby
  # Public tokenizer API. Wraps a native +InternalTokenizer+ and turns the
  # plain Hashes it returns into {Encoding} objects.
  class Tokenizer
    # Builds a tokenizer from a model identifier (for example "gpt2").
    #
    # @param identifier [String]
    # @return [Tokenizer]
    def self.from_pretrained(identifier)
      new(InternalTokenizer.from_pretrained(identifier))
    end

    # Builds a tokenizer from a local tokenizer definition file.
    #
    # @param path [String]
    # @return [Tokenizer]
    def self.from_file(path)
      new(InternalTokenizer.from_file(path))
    end

    # Tokenizes a single string.
    #
    # @param text [String]
    # @return [Encoding]
    def encode(text)
      build_encoding(@inner._encode(text))
    end

    # Converts token ids back into text.
    #
    # @param ids [Array<Integer>]
    # @return [String]
    def decode(ids)
      @inner._decode(ids)
    end

    # Tokenizes several strings in one call.
    #
    # @param texts [Array<String>]
    # @return [Array<Encoding>] one Encoding per input, in input order
    def encode_batch(texts)
      @inner._encode_batch(texts).map { |result| build_encoding(result) }
    end

    # Decodes several id sequences in one call.
    #
    # @param ids_array [Array<Array<Integer>>]
    # @return [Array<String>]
    def decode_batch(ids_array)
      @inner._decode_batch(ids_array)
    end

    # @return [Integer] vocabulary size reported by the native tokenizer
    def vocab_size
      @inner.vocab_size
    end

    # @param token [String]
    # @return [Integer, nil] id of +token+ as reported by the native tokenizer
    def token_to_id(token)
      @inner.token_to_id(token)
    end

    # @param id [Integer]
    # @return [String, nil] token for +id+ as reported by the native tokenizer
    def id_to_token(id)
      @inner.id_to_token(id)
    end

    # Counts the tokens in +text+.
    #
    # @param text [String]
    # @return [Integer]
    def count(text)
      encode(text).length
    end

    # Shortens +text+ to at most +max_tokens+ tokens.
    #
    # The original string is returned untouched when it already fits.
    # Otherwise the first +max_tokens+ ids are decoded, so the result is the
    # decoder's rendering of those tokens rather than a substring of +text+.
    #
    # @param text [String]
    # @param max_tokens [Integer] non-negative token budget
    # @return [String]
    # @raise [ArgumentError] if +max_tokens+ is negative
    def truncate(text, max_tokens:)
      # A negative budget used to slice the ids to nil and fail inside the
      # native decode call; reject it up front with a clear message instead.
      raise ArgumentError, "max_tokens must be non-negative (got #{max_tokens})" if max_tokens.negative?

      encoding = encode(text)
      return text if encoding.length <= max_tokens

      decode(encoding.ids.first(max_tokens))
    end

    # Makes subsequent encodes truncate to +max_length+ tokens.
    #
    # @param max_length [Integer]
    def enable_truncation(max_length:)
      @inner._enable_truncation(max_length)
    end

    # Makes subsequent encodes pad to +length+ tokens using +pad_token+.
    #
    # @param length [Integer]
    # @param pad_token [String]
    def enable_padding(length:, pad_token: "[PAD]")
      @inner._enable_padding(length, pad_token)
    end

    private

    # @param inner [InternalTokenizer] native tokenizer to delegate to
    def initialize(inner)
      @inner = inner
    end

    # Builds an Encoding from the Hash returned by the native layer.
    def build_encoding(result)
      Encoding.new(
        ids: result[:ids],
        tokens: result[:tokens],
        offsets: result[:offsets],
        attention_mask: result[:attention_mask]
      )
    end
  end
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "tokenizer_ruby/version"
|
|
4
|
+
require_relative "tokenizer_ruby/encoding"
|
|
5
|
+
require_relative "tokenizer_ruby/tokenizer"
|
|
6
|
+
|
|
7
|
+
# Load the compiled extension. A build placed in a directory named after the
# running Ruby's "major.minor" version is tried first (presumably where
# precompiled platform gems put their per-version binaries; confirm against
# the packaging script), then the unversioned path is used as a fallback.
begin
  ruby_minor = RUBY_VERSION[/\d+\.\d+/]
  require "tokenizer_ruby/#{ruby_minor}/tokenizer_ruby"
rescue LoadError
  require "tokenizer_ruby/tokenizer_ruby"
end
|
|
13
|
+
|
|
14
|
+
module TokenizerRuby
  # Base error class for this gem, so callers can rescue gem-specific
  # failures with a single +rescue TokenizerRuby::Error+.
  class Error < StandardError; end
end
|
metadata
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: tokenizer-ruby
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Johannes Dwi Cahyo
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: rb_sys
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - "~>"
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '0.9'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - "~>"
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '0.9'
|
|
26
|
+
description: Fast tokenization for Ruby using HuggingFace's Rust-powered tokenizers
|
|
27
|
+
library. Supports GPT, BERT, LLaMA, Claude, and any HuggingFace tokenizer.
|
|
28
|
+
executables: []
|
|
29
|
+
extensions:
|
|
30
|
+
- ext/tokenizer_ruby/extconf.rb
|
|
31
|
+
extra_rdoc_files: []
|
|
32
|
+
files:
|
|
33
|
+
- CLAUDE.md
|
|
34
|
+
- LICENSE
|
|
35
|
+
- ext/tokenizer_ruby/Cargo.toml
|
|
36
|
+
- ext/tokenizer_ruby/extconf.rb
|
|
37
|
+
- ext/tokenizer_ruby/src/lib.rs
|
|
38
|
+
- lib/tokenizer_ruby.rb
|
|
39
|
+
- lib/tokenizer_ruby/encoding.rb
|
|
40
|
+
- lib/tokenizer_ruby/tokenizer.rb
|
|
41
|
+
- lib/tokenizer_ruby/version.rb
|
|
42
|
+
homepage: https://github.com/johannesdwicahyo/tokenizer-ruby
|
|
43
|
+
licenses:
|
|
44
|
+
- MIT
|
|
45
|
+
metadata:
|
|
46
|
+
homepage_uri: https://github.com/johannesdwicahyo/tokenizer-ruby
|
|
47
|
+
source_code_uri: https://github.com/johannesdwicahyo/tokenizer-ruby
|
|
48
|
+
rdoc_options: []
|
|
49
|
+
require_paths:
|
|
50
|
+
- lib
|
|
51
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
52
|
+
requirements:
|
|
53
|
+
- - ">="
|
|
54
|
+
- !ruby/object:Gem::Version
|
|
55
|
+
version: '3.1'
|
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - ">="
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: '0'
|
|
61
|
+
requirements: []
|
|
62
|
+
rubygems_version: 3.6.9
|
|
63
|
+
specification_version: 4
|
|
64
|
+
summary: Ruby bindings for HuggingFace tokenizers
|
|
65
|
+
test_files: []
|