tokenizer-ruby 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1eea09dc0997579eab072e4fc3705f8e5e5161110126f80478922ce65732d6a5
4
- data.tar.gz: 4e6534b235e6477742a5170fc872cca3b59bbb44f32960fcb656a20d70d00c9f
3
+ metadata.gz: 0bd2566191cac59c7f91316990a1d2bf872cebd006c429de2bc34b5f217e37f0
4
+ data.tar.gz: aadf725da91f4976a0b393f3021288587e3da099cc9fae3c2a7f49f855db8a05
5
5
  SHA512:
6
- metadata.gz: 4b7f5258322ef3d76d3a16a889b0e233f0078ed4970fff3b06d63f9d47fe3aeb39690fe370a37d0e52faf08bf808919d2bfe415a22bafa5bc31f38a6bcf70642
7
- data.tar.gz: d2702b426ec82453372e9452a103921f025d68d44296e466c57e5b86d041a6c5c6b6558087a779767ac02080089a849cfee5262f863b0777331a42e51bb6b15a
6
+ metadata.gz: 19d5ca2f2cde59867f53f151ab819e73a18e3efc2cccd4c407ae71162b9689282219f5de51b9ffc017fb5dfdb9dc119acaaeb48f6714252341aafd1b509030d5
7
+ data.tar.gz: 7c8d23d28db0f55b9e9befab6fa55e111f308f87d56ffb72ce7fcb7192513b9b8f582bfcf8287fb3fb39a7c66abca70bb52e240f60aea07c95c18fbf77b687e4
data/CLAUDE.md CHANGED
@@ -174,7 +174,7 @@ Follow the same pattern as zvec-ruby:
174
174
 
175
175
  ## Publishing
176
176
 
177
- - RubyGems.org: `GEM_HOST_API_KEY=<REDACTED-RUBYGEMS-API-KEY> gem push tokenizer-ruby-*.gem`
177
+ - RubyGems.org: `GEM_HOST_API_KEY=<REDACTED-RUBYGEMS-API-KEY> gem push tokenizer-ruby-*.gem`
  <!-- SECURITY NOTE(review): both the removed and the added line contained live RubyGems API keys,
       published verbatim in the 0.1.1 and 0.2.0 gem artifacts. Rotating a key and committing the
       replacement to the same published file leaves the new key equally exposed. Revoke BOTH keys
       at rubygems.org immediately and load the credential from the environment or
       ~/.gem/credentials instead of documenting it in CLAUDE.md. -->
178
178
  - gem.coop: `GEM_HOST_API_KEY=<REDACTED-GEM-COOP-API-KEY> gem push tokenizer-ruby-*.gem --host https://beta.gem.coop/@johannesdwicahyo` <!-- SECURITY NOTE(review): a live gem.coop API key was published in plaintext here, unchanged across both released versions; revoke it and source it from the environment. -->
179
179
 
180
180
  ## Notes from zvec-ruby Experience
@@ -2,22 +2,22 @@ use magnus::{
2
2
  define_module, function, method, prelude::*, Error, RHash, Ruby,
3
3
  RArray,
4
4
  };
5
- use std::cell::RefCell;
5
+ use std::sync::Mutex;
6
6
  use tokenizers::Tokenizer;
7
7
 
8
8
  #[magnus::wrap(class = "TokenizerRuby::InternalTokenizer", free_immediately)]
9
- struct RubyTokenizer(RefCell<Tokenizer>);
9
+ struct RubyTokenizer(Mutex<Tokenizer>);
10
10
 
11
11
  fn from_pretrained(ruby: &Ruby, identifier: String) -> Result<RubyTokenizer, Error> {
12
12
  let tokenizer = Tokenizer::from_pretrained(&identifier, None)
13
13
  .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("{}", e)))?;
14
- Ok(RubyTokenizer(RefCell::new(tokenizer)))
14
+ Ok(RubyTokenizer(Mutex::new(tokenizer)))
15
15
  }
16
16
 
17
17
  fn from_file(ruby: &Ruby, path: String) -> Result<RubyTokenizer, Error> {
18
18
  let tokenizer = Tokenizer::from_file(&path)
19
19
  .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("{}", e)))?;
20
- Ok(RubyTokenizer(RefCell::new(tokenizer)))
20
+ Ok(RubyTokenizer(Mutex::new(tokenizer)))
21
21
  }
22
22
 
23
23
  fn encoding_to_hash(ruby: &Ruby, encoding: &tokenizers::Encoding) -> Result<RHash, Error> {
@@ -38,27 +38,51 @@ fn encoding_to_hash(ruby: &Ruby, encoding: &tokenizers::Encoding) -> Result<RHas
38
38
  let mask: Vec<i64> = encoding.get_attention_mask().iter().map(|&m| m as i64).collect();
39
39
  hash.aset(ruby.sym_new("attention_mask"), RArray::from_vec(mask))?;
40
40
 
41
+ // type_ids
42
+ let type_ids: Vec<i64> = encoding.get_type_ids().iter().map(|&t| t as i64).collect();
43
+ hash.aset(ruby.sym_new("type_ids"), RArray::from_vec(type_ids))?;
44
+
45
+ // special_tokens_mask
46
+ let special_mask: Vec<i64> = encoding.get_special_tokens_mask().iter().map(|&m| m as i64).collect();
47
+ hash.aset(ruby.sym_new("special_tokens_mask"), RArray::from_vec(special_mask))?;
48
+
49
+ // word_ids
50
+ let word_ids_array = RArray::new();
51
+ for word_id in encoding.get_word_ids() {
52
+ match word_id {
53
+ Some(id) => word_ids_array.push(*id as i64)?,
54
+ None => word_ids_array.push(ruby.qnil())?,
55
+ }
56
+ }
57
+ hash.aset(ruby.sym_new("word_ids"), word_ids_array)?;
58
+
41
59
  Ok(hash)
42
60
  }
43
61
 
44
62
  impl RubyTokenizer {
45
- fn encode(ruby: &Ruby, rb_self: &Self, text: String) -> Result<RHash, Error> {
46
- let encoding = rb_self.0.borrow()
47
- .encode(text.as_str(), false)
63
+ fn encode(ruby: &Ruby, rb_self: &Self, text: String, add_special_tokens: bool) -> Result<RHash, Error> {
64
+ let guard = rb_self.0.lock()
65
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
66
+ let encoding = guard
67
+ .encode(text.as_str(), add_special_tokens)
48
68
  .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("{}", e)))?;
49
69
  encoding_to_hash(ruby, &encoding)
50
70
  }
51
71
 
52
- fn decode(ruby: &Ruby, rb_self: &Self, ids: Vec<u32>) -> Result<String, Error> {
53
- rb_self.0.borrow()
54
- .decode(&ids, true)
72
+ fn decode(ruby: &Ruby, rb_self: &Self, ids: Vec<u32>, skip_special_tokens: bool) -> Result<String, Error> {
73
+ let guard = rb_self.0.lock()
74
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
75
+ guard
76
+ .decode(&ids, skip_special_tokens)
55
77
  .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("{}", e)))
56
78
  }
57
79
 
58
- fn encode_batch(ruby: &Ruby, rb_self: &Self, texts: Vec<String>) -> Result<RArray, Error> {
80
+ fn encode_batch(ruby: &Ruby, rb_self: &Self, texts: Vec<String>, add_special_tokens: bool) -> Result<RArray, Error> {
59
81
  let inputs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
60
- let encodings = rb_self.0.borrow()
61
- .encode_batch(inputs, false)
82
+ let guard = rb_self.0.lock()
83
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
84
+ let encodings = guard
85
+ .encode_batch(inputs, add_special_tokens)
62
86
  .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("{}", e)))?;
63
87
  let result = RArray::new();
64
88
  for encoding in &encodings {
@@ -67,40 +91,68 @@ impl RubyTokenizer {
67
91
  Ok(result)
68
92
  }
69
93
 
70
- fn decode_batch(ruby: &Ruby, rb_self: &Self, ids_array: Vec<Vec<u32>>) -> Result<Vec<String>, Error> {
94
+ fn decode_batch(ruby: &Ruby, rb_self: &Self, ids_array: Vec<Vec<u32>>, skip_special_tokens: bool) -> Result<Vec<String>, Error> {
71
95
  let refs: Vec<&[u32]> = ids_array.iter().map(|v| v.as_slice()).collect();
72
- rb_self.0.borrow()
73
- .decode_batch(&refs, true)
96
+ let guard = rb_self.0.lock()
97
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
98
+ guard
99
+ .decode_batch(&refs, skip_special_tokens)
74
100
  .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("{}", e)))
75
101
  }
76
102
 
77
- fn vocab_size(&self) -> usize {
78
- self.0.borrow().get_vocab_size(true)
103
+ fn vocab_size(ruby: &Ruby, rb_self: &Self) -> Result<usize, Error> {
104
+ let guard = rb_self.0.lock()
105
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
106
+ Ok(guard.get_vocab_size(true))
79
107
  }
80
108
 
81
- fn token_to_id(&self, token: String) -> Option<u32> {
82
- self.0.borrow().token_to_id(&token)
109
+ fn token_to_id(ruby: &Ruby, rb_self: &Self, token: String) -> Result<Option<u32>, Error> {
110
+ let guard = rb_self.0.lock()
111
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
112
+ Ok(guard.token_to_id(&token))
83
113
  }
84
114
 
85
- fn id_to_token(&self, id: u32) -> Option<String> {
86
- self.0.borrow().id_to_token(id)
115
+ fn id_to_token(ruby: &Ruby, rb_self: &Self, id: u32) -> Result<Option<String>, Error> {
116
+ let guard = rb_self.0.lock()
117
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
118
+ Ok(guard.id_to_token(id))
87
119
  }
88
120
 
89
- fn enable_truncation(&self, max_length: usize) {
121
+ fn enable_truncation(ruby: &Ruby, rb_self: &Self, max_length: usize) -> Result<(), Error> {
90
122
  let params = tokenizers::TruncationParams {
91
123
  max_length,
92
124
  ..Default::default()
93
125
  };
94
- let _ = self.0.borrow_mut().with_truncation(Some(params));
126
+ let mut guard = rb_self.0.lock()
127
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
128
+ let _ = guard.with_truncation(Some(params));
129
+ Ok(())
95
130
  }
96
131
 
97
- fn enable_padding(&self, length: usize, pad_token: String) {
132
+ fn disable_truncation(ruby: &Ruby, rb_self: &Self) -> Result<(), Error> {
133
+ let mut guard = rb_self.0.lock()
134
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
135
+ let _ = guard.with_truncation(None);
136
+ Ok(())
137
+ }
138
+
139
+ fn enable_padding(ruby: &Ruby, rb_self: &Self, length: usize, pad_token: String) -> Result<(), Error> {
98
140
  let params = tokenizers::PaddingParams {
99
141
  strategy: tokenizers::PaddingStrategy::Fixed(length),
100
142
  pad_token,
101
143
  ..Default::default()
102
144
  };
103
- self.0.borrow_mut().with_padding(Some(params));
145
+ let mut guard = rb_self.0.lock()
146
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
147
+ guard.with_padding(Some(params));
148
+ Ok(())
149
+ }
150
+
151
+ fn disable_padding(ruby: &Ruby, rb_self: &Self) -> Result<(), Error> {
152
+ let mut guard = rb_self.0.lock()
153
+ .map_err(|e| Error::new(ruby.exception_runtime_error(), format!("lock poisoned: {}", e)))?;
154
+ guard.with_padding(None);
155
+ Ok(())
104
156
  }
105
157
  }
106
158
 
@@ -111,15 +163,17 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
111
163
  let class = module.define_class("InternalTokenizer", ruby.class_object())?;
112
164
  class.define_singleton_method("from_pretrained", function!(from_pretrained, 1))?;
113
165
  class.define_singleton_method("from_file", function!(from_file, 1))?;
114
- class.define_method("_encode", method!(RubyTokenizer::encode, 1))?;
115
- class.define_method("_decode", method!(RubyTokenizer::decode, 1))?;
116
- class.define_method("_encode_batch", method!(RubyTokenizer::encode_batch, 1))?;
117
- class.define_method("_decode_batch", method!(RubyTokenizer::decode_batch, 1))?;
166
+ class.define_method("_encode", method!(RubyTokenizer::encode, 2))?;
167
+ class.define_method("_decode", method!(RubyTokenizer::decode, 2))?;
168
+ class.define_method("_encode_batch", method!(RubyTokenizer::encode_batch, 2))?;
169
+ class.define_method("_decode_batch", method!(RubyTokenizer::decode_batch, 2))?;
118
170
  class.define_method("vocab_size", method!(RubyTokenizer::vocab_size, 0))?;
119
171
  class.define_method("token_to_id", method!(RubyTokenizer::token_to_id, 1))?;
120
172
  class.define_method("id_to_token", method!(RubyTokenizer::id_to_token, 1))?;
121
173
  class.define_method("_enable_truncation", method!(RubyTokenizer::enable_truncation, 1))?;
174
+ class.define_method("_disable_truncation", method!(RubyTokenizer::disable_truncation, 0))?;
122
175
  class.define_method("_enable_padding", method!(RubyTokenizer::enable_padding, 2))?;
176
+ class.define_method("_disable_padding", method!(RubyTokenizer::disable_padding, 0))?;
123
177
 
124
178
  Ok(())
125
179
  }
@@ -18,46 +18,52 @@ module TokenizerRuby
18
18
  new(InternalTokenizer.from_file(path))
19
19
  end
20
20
 
21
- def encode(text)
21
+ def encode(text, add_special_tokens: false)
22
22
  raise TokenizerRuby::Error, "encode expects a String, got #{text.class}" unless text.is_a?(String)
23
23
 
24
24
  begin
25
- result = @inner._encode(text)
25
+ result = @inner._encode(text, add_special_tokens)
26
26
  rescue => e
27
- raise TokenizerRuby::Error, "failed to encode text: #{e.message}"
27
+ raise TokenizerRuby::TokenizationError, "failed to encode text: #{e.message}"
28
28
  end
29
29
  Encoding.new(
30
30
  ids: result[:ids],
31
31
  tokens: result[:tokens],
32
32
  offsets: result[:offsets],
33
- attention_mask: result[:attention_mask]
33
+ attention_mask: result[:attention_mask],
34
+ type_ids: result[:type_ids],
35
+ special_tokens_mask: result[:special_tokens_mask],
36
+ word_ids: result[:word_ids]
34
37
  )
35
38
  end
36
39
 
37
- def decode(ids)
40
+ def decode(ids, skip_special_tokens: true)
38
41
  raise TokenizerRuby::Error, "decode expects an Array, got #{ids.class}" unless ids.is_a?(Array)
39
42
 
40
43
  begin
41
- @inner._decode(ids)
44
+ @inner._decode(ids, skip_special_tokens)
42
45
  rescue => e
43
- raise TokenizerRuby::Error, "failed to decode ids: #{e.message}"
46
+ raise TokenizerRuby::TokenizationError, "failed to decode ids: #{e.message}"
44
47
  end
45
48
  end
46
49
 
47
- def encode_batch(texts)
48
- results = @inner._encode_batch(texts)
50
+ def encode_batch(texts, add_special_tokens: false)
51
+ results = @inner._encode_batch(texts, add_special_tokens)
49
52
  results.map do |result|
50
53
  Encoding.new(
51
54
  ids: result[:ids],
52
55
  tokens: result[:tokens],
53
56
  offsets: result[:offsets],
54
- attention_mask: result[:attention_mask]
57
+ attention_mask: result[:attention_mask],
58
+ type_ids: result[:type_ids],
59
+ special_tokens_mask: result[:special_tokens_mask],
60
+ word_ids: result[:word_ids]
55
61
  )
56
62
  end
57
63
  end
58
64
 
59
- def decode_batch(ids_array)
60
- @inner._decode_batch(ids_array)
65
+ def decode_batch(ids_array, skip_special_tokens: true)
66
+ @inner._decode_batch(ids_array, skip_special_tokens)
61
67
  end
62
68
 
63
69
  def vocab_size
@@ -77,7 +83,7 @@ module TokenizerRuby
77
83
  end
78
84
 
79
85
  def truncate(text, max_tokens:)
80
- raise TokenizerRuby::Error, "max_tokens must be positive, got #{max_tokens}" unless max_tokens > 0
86
+ raise TokenizerRuby::ConfigurationError, "max_tokens must be positive, got #{max_tokens}" unless max_tokens > 0
81
87
 
82
88
  encoding = encode(text)
83
89
  return text if encoding.length <= max_tokens
@@ -90,8 +96,16 @@ module TokenizerRuby
90
96
  @inner._enable_truncation(max_length)
91
97
  end
92
98
 
99
+ def disable_truncation
100
+ @inner._disable_truncation
101
+ end
102
+
93
103
  def enable_padding(length:, pad_token: "[PAD]")
94
104
  @inner._enable_padding(length, pad_token)
95
105
  end
106
+
107
+ def disable_padding
108
+ @inner._disable_padding
109
+ end
96
110
  end
97
111
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module TokenizerRuby
4
- VERSION = "0.1.1"
4
+ VERSION = "0.2.0"
5
5
  end
@@ -13,4 +13,7 @@ end
13
13
 
14
14
  module TokenizerRuby
15
15
  class Error < StandardError; end
16
+ class TokenizationError < Error; end
17
+ class FileNotFoundError < Error; end
18
+ class ConfigurationError < Error; end
16
19
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizer-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Johannes Dwi Cahyo