tokenizer-ruby 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CLAUDE.md +1 -1
- data/ext/tokenizer_ruby/src/lib.rs +84 -30
- data/lib/tokenizer_ruby/tokenizer.rb +27 -13
- data/lib/tokenizer_ruby/version.rb +1 -1
- data/lib/tokenizer_ruby.rb +3 -0
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 0bd2566191cac59c7f91316990a1d2bf872cebd006c429de2bc34b5f217e37f0
|
|
4
|
+
data.tar.gz: aadf725da91f4976a0b393f3021288587e3da099cc9fae3c2a7f49f855db8a05
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 19d5ca2f2cde59867f53f151ab819e73a18e3efc2cccd4c407ae71162b9689282219f5de51b9ffc017fb5dfdb9dc119acaaeb48f6714252341aafd1b509030d5
|
|
7
|
+
data.tar.gz: 7c8d23d28db0f55b9e9befab6fa55e111f308f87d56ffb72ce7fcb7192513b9b8f582bfcf8287fb3fb39a7c66abca70bb52e240f60aea07c95c18fbf77b687e4
|
data/CLAUDE.md
CHANGED
|
@@ -174,7 +174,7 @@ Follow the same pattern as zvec-ruby:
|
|
|
174
174
|
|
|
175
175
|
## Publishing
|
|
176
176
|
|
|
177
|
-
- RubyGems.org: `GEM_HOST_API_KEY=
|
|
177
|
+
- RubyGems.org: `GEM_HOST_API_KEY=<REDACTED — a live RubyGems API key was committed here and published in the gem; revoke/rotate it at rubygems.com/settings/edit immediately> gem push tokenizer-ruby-*.gem`
|
|
178
178
|
- gem.coop: `GEM_HOST_API_KEY=<REDACTED — a live gem.coop host API key was committed here and published in the gem; revoke/rotate it immediately> gem push tokenizer-ruby-*.gem --host https://beta.gem.coop/@johannesdwicahyo`
|
|
179
179
|
|
|
180
180
|
## Notes from zvec-ruby Experience
|
|
@@ -2,22 +2,22 @@ use magnus::{
|
|
|
2
2
|
define_module, function, method, prelude::*, Error, RHash, Ruby,
|
|
3
3
|
RArray,
|
|
4
4
|
};
|
|
5
|
-
use std::
|
|
5
|
+
use std::sync::Mutex;
|
|
6
6
|
use tokenizers::Tokenizer;
|
|
7
7
|
|
|
8
8
|
#[magnus::wrap(class = "TokenizerRuby::InternalTokenizer", free_immediately)]
|
|
9
|
-
struct RubyTokenizer(
|
|
9
|
+
struct RubyTokenizer(Mutex<Tokenizer>);
|
|
10
10
|
|
|
11
11
|
fn from_pretrained(ruby: &Ruby, identifier: String) -> Result<RubyTokenizer, Error> {
|
|
12
12
|
let tokenizer = Tokenizer::from_pretrained(&identifier, None)
|
|
13
13
|
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("{}", e)))?;
|
|
14
|
-
Ok(RubyTokenizer(
|
|
14
|
+
Ok(RubyTokenizer(Mutex::new(tokenizer)))
|
|
15
15
|
}
|
|
16
16
|
|
|
17
17
|
fn from_file(ruby: &Ruby, path: String) -> Result<RubyTokenizer, Error> {
|
|
18
18
|
let tokenizer = Tokenizer::from_file(&path)
|
|
19
19
|
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("{}", e)))?;
|
|
20
|
-
Ok(RubyTokenizer(
|
|
20
|
+
Ok(RubyTokenizer(Mutex::new(tokenizer)))
|
|
21
21
|
}
|
|
22
22
|
|
|
23
23
|
fn encoding_to_hash(ruby: &Ruby, encoding: &tokenizers::Encoding) -> Result<RHash, Error> {
|
|
@@ -38,27 +38,51 @@ fn encoding_to_hash(ruby: &Ruby, encoding: &tokenizers::Encoding) -> Result<RHas
|
|
|
38
38
|
let mask: Vec<i64> = encoding.get_attention_mask().iter().map(|&m| m as i64).collect();
|
|
39
39
|
hash.aset(ruby.sym_new("attention_mask"), RArray::from_vec(mask))?;
|
|
40
40
|
|
|
41
|
+
// type_ids
|
|
42
|
+
let type_ids: Vec<i64> = encoding.get_type_ids().iter().map(|&t| t as i64).collect();
|
|
43
|
+
hash.aset(ruby.sym_new("type_ids"), RArray::from_vec(type_ids))?;
|
|
44
|
+
|
|
45
|
+
// special_tokens_mask
|
|
46
|
+
let special_mask: Vec<i64> = encoding.get_special_tokens_mask().iter().map(|&m| m as i64).collect();
|
|
47
|
+
hash.aset(ruby.sym_new("special_tokens_mask"), RArray::from_vec(special_mask))?;
|
|
48
|
+
|
|
49
|
+
// word_ids
|
|
50
|
+
let word_ids_array = RArray::new();
|
|
51
|
+
for word_id in encoding.get_word_ids() {
|
|
52
|
+
match word_id {
|
|
53
|
+
Some(id) => word_ids_array.push(*id as i64)?,
|
|
54
|
+
None => word_ids_array.push(ruby.qnil())?,
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
hash.aset(ruby.sym_new("word_ids"), word_ids_array)?;
|
|
58
|
+
|
|
41
59
|
Ok(hash)
|
|
42
60
|
}
|
|
43
61
|
|
|
44
62
|
impl RubyTokenizer {
|
|
45
|
-
fn encode(ruby: &Ruby, rb_self: &Self, text: String) -> Result<RHash, Error> {
|
|
46
|
-
let
|
|
47
|
-
.
|
|
63
|
+
fn encode(ruby: &Ruby, rb_self: &Self, text: String, add_special_tokens: bool) -> Result<RHash, Error> {
|
|
64
|
+
let guard = rb_self.0.lock()
|
|
65
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
|
|
66
|
+
let encoding = guard
|
|
67
|
+
.encode(text.as_str(), add_special_tokens)
|
|
48
68
|
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("{}", e)))?;
|
|
49
69
|
encoding_to_hash(ruby, &encoding)
|
|
50
70
|
}
|
|
51
71
|
|
|
52
|
-
fn decode(ruby: &Ruby, rb_self: &Self, ids: Vec<u32
|
|
53
|
-
rb_self.0.
|
|
54
|
-
.
|
|
72
|
+
fn decode(ruby: &Ruby, rb_self: &Self, ids: Vec<u32>, skip_special_tokens: bool) -> Result<String, Error> {
|
|
73
|
+
let guard = rb_self.0.lock()
|
|
74
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
|
|
75
|
+
guard
|
|
76
|
+
.decode(&ids, skip_special_tokens)
|
|
55
77
|
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("{}", e)))
|
|
56
78
|
}
|
|
57
79
|
|
|
58
|
-
fn encode_batch(ruby: &Ruby, rb_self: &Self, texts: Vec<String
|
|
80
|
+
fn encode_batch(ruby: &Ruby, rb_self: &Self, texts: Vec<String>, add_special_tokens: bool) -> Result<RArray, Error> {
|
|
59
81
|
let inputs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
|
|
60
|
-
let
|
|
61
|
-
.
|
|
82
|
+
let guard = rb_self.0.lock()
|
|
83
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
|
|
84
|
+
let encodings = guard
|
|
85
|
+
.encode_batch(inputs, add_special_tokens)
|
|
62
86
|
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("{}", e)))?;
|
|
63
87
|
let result = RArray::new();
|
|
64
88
|
for encoding in &encodings {
|
|
@@ -67,40 +91,68 @@ impl RubyTokenizer {
|
|
|
67
91
|
Ok(result)
|
|
68
92
|
}
|
|
69
93
|
|
|
70
|
-
fn decode_batch(ruby: &Ruby, rb_self: &Self, ids_array: Vec<Vec<u32
|
|
94
|
+
fn decode_batch(ruby: &Ruby, rb_self: &Self, ids_array: Vec<Vec<u32>>, skip_special_tokens: bool) -> Result<Vec<String>, Error> {
|
|
71
95
|
let refs: Vec<&[u32]> = ids_array.iter().map(|v| v.as_slice()).collect();
|
|
72
|
-
rb_self.0.
|
|
73
|
-
.
|
|
96
|
+
let guard = rb_self.0.lock()
|
|
97
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
|
|
98
|
+
guard
|
|
99
|
+
.decode_batch(&refs, skip_special_tokens)
|
|
74
100
|
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("{}", e)))
|
|
75
101
|
}
|
|
76
102
|
|
|
77
|
-
fn vocab_size(&
|
|
78
|
-
|
|
103
|
+
fn vocab_size(ruby: &Ruby, rb_self: &Self) -> Result<usize, Error> {
|
|
104
|
+
let guard = rb_self.0.lock()
|
|
105
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
|
|
106
|
+
Ok(guard.get_vocab_size(true))
|
|
79
107
|
}
|
|
80
108
|
|
|
81
|
-
fn token_to_id(&
|
|
82
|
-
|
|
109
|
+
fn token_to_id(ruby: &Ruby, rb_self: &Self, token: String) -> Result<Option<u32>, Error> {
|
|
110
|
+
let guard = rb_self.0.lock()
|
|
111
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
|
|
112
|
+
Ok(guard.token_to_id(&token))
|
|
83
113
|
}
|
|
84
114
|
|
|
85
|
-
fn id_to_token(&
|
|
86
|
-
|
|
115
|
+
fn id_to_token(ruby: &Ruby, rb_self: &Self, id: u32) -> Result<Option<String>, Error> {
|
|
116
|
+
let guard = rb_self.0.lock()
|
|
117
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
|
|
118
|
+
Ok(guard.id_to_token(id))
|
|
87
119
|
}
|
|
88
120
|
|
|
89
|
-
fn enable_truncation(&
|
|
121
|
+
fn enable_truncation(ruby: &Ruby, rb_self: &Self, max_length: usize) -> Result<(), Error> {
|
|
90
122
|
let params = tokenizers::TruncationParams {
|
|
91
123
|
max_length,
|
|
92
124
|
..Default::default()
|
|
93
125
|
};
|
|
94
|
-
let
|
|
126
|
+
let mut guard = rb_self.0.lock()
|
|
127
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
|
|
128
|
+
let _ = guard.with_truncation(Some(params));
|
|
129
|
+
Ok(())
|
|
95
130
|
}
|
|
96
131
|
|
|
97
|
-
fn
|
|
132
|
+
fn disable_truncation(ruby: &Ruby, rb_self: &Self) -> Result<(), Error> {
|
|
133
|
+
let mut guard = rb_self.0.lock()
|
|
134
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
|
|
135
|
+
let _ = guard.with_truncation(None);
|
|
136
|
+
Ok(())
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
fn enable_padding(ruby: &Ruby, rb_self: &Self, length: usize, pad_token: String) -> Result<(), Error> {
|
|
98
140
|
let params = tokenizers::PaddingParams {
|
|
99
141
|
strategy: tokenizers::PaddingStrategy::Fixed(length),
|
|
100
142
|
pad_token,
|
|
101
143
|
..Default::default()
|
|
102
144
|
};
|
|
103
|
-
|
|
145
|
+
let mut guard = rb_self.0.lock()
|
|
146
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
|
|
147
|
+
guard.with_padding(Some(params));
|
|
148
|
+
Ok(())
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
fn disable_padding(ruby: &Ruby, rb_self: &Self) -> Result<(), Error> {
|
|
152
|
+
let mut guard = rb_self.0.lock()
|
|
153
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
|
|
154
|
+
guard.with_padding(None);
|
|
155
|
+
Ok(())
|
|
104
156
|
}
|
|
105
157
|
}
|
|
106
158
|
|
|
@@ -111,15 +163,17 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
111
163
|
let class = module.define_class("InternalTokenizer", ruby.class_object())?;
|
|
112
164
|
class.define_singleton_method("from_pretrained", function!(from_pretrained, 1))?;
|
|
113
165
|
class.define_singleton_method("from_file", function!(from_file, 1))?;
|
|
114
|
-
class.define_method("_encode", method!(RubyTokenizer::encode,
|
|
115
|
-
class.define_method("_decode", method!(RubyTokenizer::decode,
|
|
116
|
-
class.define_method("_encode_batch", method!(RubyTokenizer::encode_batch,
|
|
117
|
-
class.define_method("_decode_batch", method!(RubyTokenizer::decode_batch,
|
|
166
|
+
class.define_method("_encode", method!(RubyTokenizer::encode, 2))?;
|
|
167
|
+
class.define_method("_decode", method!(RubyTokenizer::decode, 2))?;
|
|
168
|
+
class.define_method("_encode_batch", method!(RubyTokenizer::encode_batch, 2))?;
|
|
169
|
+
class.define_method("_decode_batch", method!(RubyTokenizer::decode_batch, 2))?;
|
|
118
170
|
class.define_method("vocab_size", method!(RubyTokenizer::vocab_size, 0))?;
|
|
119
171
|
class.define_method("token_to_id", method!(RubyTokenizer::token_to_id, 1))?;
|
|
120
172
|
class.define_method("id_to_token", method!(RubyTokenizer::id_to_token, 1))?;
|
|
121
173
|
class.define_method("_enable_truncation", method!(RubyTokenizer::enable_truncation, 1))?;
|
|
174
|
+
class.define_method("_disable_truncation", method!(RubyTokenizer::disable_truncation, 0))?;
|
|
122
175
|
class.define_method("_enable_padding", method!(RubyTokenizer::enable_padding, 2))?;
|
|
176
|
+
class.define_method("_disable_padding", method!(RubyTokenizer::disable_padding, 0))?;
|
|
123
177
|
|
|
124
178
|
Ok(())
|
|
125
179
|
}
|
|
@@ -18,46 +18,52 @@ module TokenizerRuby
|
|
|
18
18
|
new(InternalTokenizer.from_file(path))
|
|
19
19
|
end
|
|
20
20
|
|
|
21
|
-
def encode(text)
|
|
21
|
+
def encode(text, add_special_tokens: false)
|
|
22
22
|
raise TokenizerRuby::Error, "encode expects a String, got #{text.class}" unless text.is_a?(String)
|
|
23
23
|
|
|
24
24
|
begin
|
|
25
|
-
result = @inner._encode(text)
|
|
25
|
+
result = @inner._encode(text, add_special_tokens)
|
|
26
26
|
rescue => e
|
|
27
|
-
raise TokenizerRuby::
|
|
27
|
+
raise TokenizerRuby::TokenizationError, "failed to encode text: #{e.message}"
|
|
28
28
|
end
|
|
29
29
|
Encoding.new(
|
|
30
30
|
ids: result[:ids],
|
|
31
31
|
tokens: result[:tokens],
|
|
32
32
|
offsets: result[:offsets],
|
|
33
|
-
attention_mask: result[:attention_mask]
|
|
33
|
+
attention_mask: result[:attention_mask],
|
|
34
|
+
type_ids: result[:type_ids],
|
|
35
|
+
special_tokens_mask: result[:special_tokens_mask],
|
|
36
|
+
word_ids: result[:word_ids]
|
|
34
37
|
)
|
|
35
38
|
end
|
|
36
39
|
|
|
37
|
-
def decode(ids)
|
|
40
|
+
def decode(ids, skip_special_tokens: true)
|
|
38
41
|
raise TokenizerRuby::Error, "decode expects an Array, got #{ids.class}" unless ids.is_a?(Array)
|
|
39
42
|
|
|
40
43
|
begin
|
|
41
|
-
@inner._decode(ids)
|
|
44
|
+
@inner._decode(ids, skip_special_tokens)
|
|
42
45
|
rescue => e
|
|
43
|
-
raise TokenizerRuby::
|
|
46
|
+
raise TokenizerRuby::TokenizationError, "failed to decode ids: #{e.message}"
|
|
44
47
|
end
|
|
45
48
|
end
|
|
46
49
|
|
|
47
|
-
def encode_batch(texts)
|
|
48
|
-
results = @inner._encode_batch(texts)
|
|
50
|
+
def encode_batch(texts, add_special_tokens: false)
|
|
51
|
+
results = @inner._encode_batch(texts, add_special_tokens)
|
|
49
52
|
results.map do |result|
|
|
50
53
|
Encoding.new(
|
|
51
54
|
ids: result[:ids],
|
|
52
55
|
tokens: result[:tokens],
|
|
53
56
|
offsets: result[:offsets],
|
|
54
|
-
attention_mask: result[:attention_mask]
|
|
57
|
+
attention_mask: result[:attention_mask],
|
|
58
|
+
type_ids: result[:type_ids],
|
|
59
|
+
special_tokens_mask: result[:special_tokens_mask],
|
|
60
|
+
word_ids: result[:word_ids]
|
|
55
61
|
)
|
|
56
62
|
end
|
|
57
63
|
end
|
|
58
64
|
|
|
59
|
-
def decode_batch(ids_array)
|
|
60
|
-
@inner._decode_batch(ids_array)
|
|
65
|
+
def decode_batch(ids_array, skip_special_tokens: true)
|
|
66
|
+
@inner._decode_batch(ids_array, skip_special_tokens)
|
|
61
67
|
end
|
|
62
68
|
|
|
63
69
|
def vocab_size
|
|
@@ -77,7 +83,7 @@ module TokenizerRuby
|
|
|
77
83
|
end
|
|
78
84
|
|
|
79
85
|
def truncate(text, max_tokens:)
|
|
80
|
-
raise TokenizerRuby::
|
|
86
|
+
raise TokenizerRuby::ConfigurationError, "max_tokens must be positive, got #{max_tokens}" unless max_tokens > 0
|
|
81
87
|
|
|
82
88
|
encoding = encode(text)
|
|
83
89
|
return text if encoding.length <= max_tokens
|
|
@@ -90,8 +96,16 @@ module TokenizerRuby
|
|
|
90
96
|
@inner._enable_truncation(max_length)
|
|
91
97
|
end
|
|
92
98
|
|
|
99
|
+
def disable_truncation
|
|
100
|
+
@inner._disable_truncation
|
|
101
|
+
end
|
|
102
|
+
|
|
93
103
|
def enable_padding(length:, pad_token: "[PAD]")
|
|
94
104
|
@inner._enable_padding(length, pad_token)
|
|
95
105
|
end
|
|
106
|
+
|
|
107
|
+
def disable_padding
|
|
108
|
+
@inner._disable_padding
|
|
109
|
+
end
|
|
96
110
|
end
|
|
97
111
|
end
|
data/lib/tokenizer_ruby.rb
CHANGED