tokenizer-ruby 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CLAUDE.md +1 -1
- data/README.md +107 -0
- data/lib/tokenizer_ruby/encoding.rb +6 -2
- data/lib/tokenizer_ruby/tokenizer.rb +24 -8
- data/lib/tokenizer_ruby/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1eea09dc0997579eab072e4fc3705f8e5e5161110126f80478922ce65732d6a5
|
|
4
|
+
data.tar.gz: 4e6534b235e6477742a5170fc872cca3b59bbb44f32960fcb656a20d70d00c9f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4b7f5258322ef3d76d3a16a889b0e233f0078ed4970fff3b06d63f9d47fe3aeb39690fe370a37d0e52faf08bf808919d2bfe415a22bafa5bc31f38a6bcf70642
|
|
7
|
+
data.tar.gz: d2702b426ec82453372e9452a103921f025d68d44296e466c57e5b86d041a6c5c6b6558087a779767ac02080089a849cfee5262f863b0777331a42e51bb6b15a
|
data/CLAUDE.md
CHANGED
|
@@ -174,7 +174,7 @@ Follow the same pattern as zvec-ruby:
|
|
|
174
174
|
|
|
175
175
|
## Publishing
|
|
176
176
|
|
|
177
|
-
- RubyGems.org: `gem push tokenizer-ruby-*.gem`
|
|
177
|
+
- RubyGems.org: `GEM_HOST_API_KEY=<REDACTED — a live rubygems.org API key was committed here in the published package; revoke/rotate it immediately> gem push tokenizer-ruby-*.gem`
|
|
178
178
|
- gem.coop: `GEM_HOST_API_KEY=<REDACTED — a live gem.coop API key was committed here in the published package; revoke/rotate it immediately> gem push tokenizer-ruby-*.gem --host https://beta.gem.coop/@johannesdwicahyo`
|
|
179
179
|
|
|
180
180
|
## Notes from zvec-ruby Experience
|
data/README.md
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# tokenizer-ruby
|
|
2
|
+
|
|
3
|
+
Ruby bindings for [HuggingFace Tokenizers](https://github.com/huggingface/tokenizers). Fast, Rust-powered tokenization for any HuggingFace model — GPT-2, BERT, LLaMA, Claude, and more.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
gem install tokenizer-ruby
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Or add to your Gemfile:
|
|
12
|
+
|
|
13
|
+
```ruby
|
|
14
|
+
gem "tokenizer-ruby"
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
**Note:** Requires Rust toolchain for compilation. Install via [rustup](https://rustup.rs/).
|
|
18
|
+
|
|
19
|
+
## Usage
|
|
20
|
+
|
|
21
|
+
### Load a tokenizer
|
|
22
|
+
|
|
23
|
+
```ruby
|
|
24
|
+
require "tokenizer_ruby"
|
|
25
|
+
|
|
26
|
+
# From HuggingFace Hub
|
|
27
|
+
tokenizer = TokenizerRuby::Tokenizer.from_pretrained("gpt2")
|
|
28
|
+
tokenizer = TokenizerRuby::Tokenizer.from_pretrained("bert-base-uncased")
|
|
29
|
+
|
|
30
|
+
# From a local file
|
|
31
|
+
tokenizer = TokenizerRuby::Tokenizer.from_file("/path/to/tokenizer.json")
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
### Encode and decode
|
|
35
|
+
|
|
36
|
+
```ruby
|
|
37
|
+
encoding = tokenizer.encode("Hello, world!")
|
|
38
|
+
encoding.ids # => [15496, 11, 995, 0]
|
|
39
|
+
encoding.tokens # => ["Hello", ",", " world", "!"]
|
|
40
|
+
encoding.offsets # => [[0, 5], [5, 6], [6, 12], [12, 13]]
|
|
41
|
+
encoding.attention_mask # => [1, 1, 1, 1]
|
|
42
|
+
encoding.length # => 4
|
|
43
|
+
|
|
44
|
+
tokenizer.decode([15496, 11, 995, 0]) # => "Hello, world!"
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### Batch processing
|
|
48
|
+
|
|
49
|
+
```ruby
|
|
50
|
+
encodings = tokenizer.encode_batch(["Hello", "World"])
|
|
51
|
+
decoded = tokenizer.decode_batch(encodings.map(&:ids))
|
|
52
|
+
# => ["Hello", "World"]
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Token counting
|
|
56
|
+
|
|
57
|
+
```ruby
|
|
58
|
+
tokenizer.count("Hello, world!") # => 4
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Truncation
|
|
62
|
+
|
|
63
|
+
```ruby
|
|
64
|
+
# Truncate text to a token limit
|
|
65
|
+
tokenizer.truncate("This is a long sentence...", max_tokens: 5)
|
|
66
|
+
|
|
67
|
+
# Enable automatic truncation on all encodes
|
|
68
|
+
tokenizer.enable_truncation(max_length: 512)
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Padding
|
|
72
|
+
|
|
73
|
+
```ruby
|
|
74
|
+
tokenizer.enable_padding(length: 128, pad_token: "[PAD]")
|
|
75
|
+
encoding = tokenizer.encode("Hello")
|
|
76
|
+
encoding.ids.length # => 128
|
|
77
|
+
encoding.attention_mask # => [1, 0, 0, 0, ...]
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Vocabulary
|
|
81
|
+
|
|
82
|
+
```ruby
|
|
83
|
+
tokenizer.vocab_size # => 50257
|
|
84
|
+
tokenizer.token_to_id("hello") # => 31373
|
|
85
|
+
tokenizer.id_to_token(31373) # => "hello"
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Requirements
|
|
89
|
+
|
|
90
|
+
- Ruby >= 3.1
|
|
91
|
+
- Rust toolchain (for building from source)
|
|
92
|
+
|
|
93
|
+
## Development
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
bundle install
|
|
97
|
+
bundle exec rake compile
|
|
98
|
+
bundle exec rake test
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## License
|
|
102
|
+
|
|
103
|
+
MIT
|
|
104
|
+
|
|
105
|
+
## Author
|
|
106
|
+
|
|
107
|
+
Johannes Dwi Cahyo — [@johannesdwicahyo](https://github.com/johannesdwicahyo)
|
|
@@ -2,13 +2,17 @@
|
|
|
2
2
|
|
|
3
3
|
module TokenizerRuby
|
|
4
4
|
class Encoding
|
|
5
|
-
attr_reader :ids, :tokens, :offsets, :attention_mask
|
|
5
|
+
attr_reader :ids, :tokens, :offsets, :attention_mask,
|
|
6
|
+
:type_ids, :special_tokens_mask, :word_ids
|
|
6
7
|
|
|
7
|
-
def initialize(ids:, tokens:, offsets:, attention_mask:)
|
|
8
|
+
def initialize(ids:, tokens:, offsets:, attention_mask:, type_ids: nil, special_tokens_mask: nil, word_ids: nil)
|
|
8
9
|
@ids = ids
|
|
9
10
|
@tokens = tokens
|
|
10
11
|
@offsets = offsets
|
|
11
12
|
@attention_mask = attention_mask
|
|
13
|
+
@type_ids = type_ids
|
|
14
|
+
@special_tokens_mask = special_tokens_mask
|
|
15
|
+
@word_ids = word_ids
|
|
12
16
|
end
|
|
13
17
|
|
|
14
18
|
def length
|
|
@@ -2,6 +2,14 @@
|
|
|
2
2
|
|
|
3
3
|
module TokenizerRuby
|
|
4
4
|
class Tokenizer
|
|
5
|
+
def initialize(path_or_internal)
|
|
6
|
+
if path_or_internal.is_a?(String)
|
|
7
|
+
@inner = InternalTokenizer.from_file(path_or_internal)
|
|
8
|
+
else
|
|
9
|
+
@inner = path_or_internal
|
|
10
|
+
end
|
|
11
|
+
end
|
|
12
|
+
|
|
5
13
|
def self.from_pretrained(identifier)
|
|
6
14
|
new(InternalTokenizer.from_pretrained(identifier))
|
|
7
15
|
end
|
|
@@ -11,7 +19,13 @@ module TokenizerRuby
|
|
|
11
19
|
end
|
|
12
20
|
|
|
13
21
|
def encode(text)
|
|
14
|
-
|
|
22
|
+
raise TokenizerRuby::Error, "encode expects a String, got #{text.class}" unless text.is_a?(String)
|
|
23
|
+
|
|
24
|
+
begin
|
|
25
|
+
result = @inner._encode(text)
|
|
26
|
+
rescue => e
|
|
27
|
+
raise TokenizerRuby::Error, "failed to encode text: #{e.message}"
|
|
28
|
+
end
|
|
15
29
|
Encoding.new(
|
|
16
30
|
ids: result[:ids],
|
|
17
31
|
tokens: result[:tokens],
|
|
@@ -21,7 +35,13 @@ module TokenizerRuby
|
|
|
21
35
|
end
|
|
22
36
|
|
|
23
37
|
def decode(ids)
|
|
24
|
-
|
|
38
|
+
raise TokenizerRuby::Error, "decode expects an Array, got #{ids.class}" unless ids.is_a?(Array)
|
|
39
|
+
|
|
40
|
+
begin
|
|
41
|
+
@inner._decode(ids)
|
|
42
|
+
rescue => e
|
|
43
|
+
raise TokenizerRuby::Error, "failed to decode ids: #{e.message}"
|
|
44
|
+
end
|
|
25
45
|
end
|
|
26
46
|
|
|
27
47
|
def encode_batch(texts)
|
|
@@ -57,6 +77,8 @@ module TokenizerRuby
|
|
|
57
77
|
end
|
|
58
78
|
|
|
59
79
|
def truncate(text, max_tokens:)
|
|
80
|
+
raise TokenizerRuby::Error, "max_tokens must be positive, got #{max_tokens}" unless max_tokens > 0
|
|
81
|
+
|
|
60
82
|
encoding = encode(text)
|
|
61
83
|
return text if encoding.length <= max_tokens
|
|
62
84
|
|
|
@@ -71,11 +93,5 @@ module TokenizerRuby
|
|
|
71
93
|
def enable_padding(length:, pad_token: "[PAD]")
|
|
72
94
|
@inner._enable_padding(length, pad_token)
|
|
73
95
|
end
|
|
74
|
-
|
|
75
|
-
private
|
|
76
|
-
|
|
77
|
-
def initialize(inner)
|
|
78
|
-
@inner = inner
|
|
79
|
-
end
|
|
80
96
|
end
|
|
81
97
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tokenizer-ruby
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0
|
|
4
|
+
version: 0.1.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Johannes Dwi Cahyo
|
|
@@ -32,6 +32,7 @@ extra_rdoc_files: []
|
|
|
32
32
|
files:
|
|
33
33
|
- CLAUDE.md
|
|
34
34
|
- LICENSE
|
|
35
|
+
- README.md
|
|
35
36
|
- ext/tokenizer_ruby/Cargo.toml
|
|
36
37
|
- ext/tokenizer_ruby/extconf.rb
|
|
37
38
|
- ext/tokenizer_ruby/src/lib.rs
|