tokenizers 0.4.4 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,21 +1,21 @@
1
1
  [package]
2
2
  name = "tokenizers"
3
- version = "0.4.4"
3
+ version = "0.5.1"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
7
- rust-version = "1.62.0"
7
+ rust-version = "1.63.0"
8
8
  publish = false
9
9
 
10
10
  [lib]
11
11
  crate-type = ["cdylib"]
12
12
 
13
13
  [dependencies]
14
- magnus = "0.6"
14
+ magnus = "0.7"
15
15
  onig = { version = "6", default-features = false }
16
16
  serde = { version = "1", features = ["rc", "derive"] }
17
17
 
18
18
  [dependencies.tokenizers]
19
- version = "=0.15.2" # also update in from_pretrained.rb
19
+ version = "=0.20.0" # also update in from_pretrained.rb
20
20
  default-features = false
21
21
  features = ["progressbar", "onig", "esaxx_fast"]
@@ -1,5 +1,6 @@
1
1
  use std::sync::{Arc, RwLock};
2
2
 
3
+ use crate::pre_tokenizers::from_string;
3
4
  use magnus::value::Lazy;
4
5
  use magnus::{
5
6
  data_type_builder, function, method, Class, DataType, DataTypeFunctions, Module, Object, RClass, RModule,
@@ -11,7 +12,7 @@ use tk::decoders::byte_fallback::ByteFallback;
11
12
  use tk::decoders::byte_level::ByteLevel;
12
13
  use tk::decoders::ctc::CTC;
13
14
  use tk::decoders::fuse::Fuse;
14
- use tk::decoders::metaspace::Metaspace;
15
+ use tk::decoders::metaspace::{Metaspace, PrependScheme};
15
16
  use tk::decoders::strip::Strip;
16
17
  use tk::decoders::wordpiece::WordPiece;
17
18
  use tk::decoders::DecoderWrapper;
@@ -126,12 +127,29 @@ impl RbDecoder {
126
127
  setter!(self, Metaspace, @set_replacement, replacement);
127
128
  }
128
129
 
129
- pub fn metaspace_add_prefix_space(&self) -> bool {
130
- getter!(self, Metaspace, add_prefix_space)
130
+ pub fn metaspace_split(&self) -> bool {
131
+ getter!(self, Metaspace, get_split())
131
132
  }
132
133
 
133
- pub fn metaspace_set_add_prefix_space(&self, add_prefix_space: bool) {
134
- setter!(self, Metaspace, add_prefix_space, add_prefix_space);
134
+ pub fn metaspace_set_split(&self, split: bool) {
135
+ setter!(self, Metaspace, @set_split, split);
136
+ }
137
+
138
+ pub fn metaspace_prepend_scheme(&self) -> String {
139
+ // Assuming Metaspace has a method to get the prepend_scheme as a string
140
+ let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
141
+ match scheme {
142
+ PrependScheme::First => "first",
143
+ PrependScheme::Never => "never",
144
+ PrependScheme::Always => "always",
145
+ }
146
+ .to_string()
147
+ }
148
+
149
+ pub fn metaspace_set_prepend_scheme(&self, prepend_scheme: String) -> RbResult<()> {
150
+ let scheme = from_string(prepend_scheme)?;
151
+ setter!(self, Metaspace, @set_prepend_scheme, scheme);
152
+ Ok(())
135
153
  }
136
154
 
137
155
  pub fn word_piece_cleanup(&self) -> bool {
@@ -194,8 +212,9 @@ impl RbFuse {
194
212
  pub struct RbMetaspaceDecoder {}
195
213
 
196
214
  impl RbMetaspaceDecoder {
197
- pub fn new(replacement: char, add_prefix_space: bool) -> RbDecoder {
198
- Metaspace::new(replacement, add_prefix_space).into()
215
+ pub fn new(replacement: char, prepend_scheme: String, split: bool) -> RbResult<RbDecoder> {
216
+ let prepend_scheme = from_string(prepend_scheme)?;
217
+ Ok(Metaspace::new(replacement, prepend_scheme, split).into())
199
218
  }
200
219
  }
201
220
 
@@ -364,11 +383,13 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
364
383
  class.define_singleton_method("new", function!(RbFuse::new, 0))?;
365
384
 
366
385
  let class = module.define_class("Metaspace", decoder)?;
367
- class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 2))?;
368
- class.define_method("add_prefix_space", method!(RbDecoder::metaspace_add_prefix_space, 0))?;
369
- class.define_method("add_prefix_space=", method!(RbDecoder::metaspace_set_add_prefix_space, 1))?;
386
+ class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 3))?;
387
+ class.define_method("prepend_scheme", method!(RbDecoder::metaspace_prepend_scheme, 0))?;
388
+ class.define_method("prepend_scheme=", method!(RbDecoder::metaspace_set_prepend_scheme, 1))?;
370
389
  class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
371
390
  class.define_method("replacement=", method!(RbDecoder::metaspace_set_replacement, 1))?;
391
+ class.define_method("split", method!(RbDecoder::metaspace_split, 0))?;
392
+ class.define_method("split=", method!(RbDecoder::metaspace_set_split, 1))?;
372
393
 
373
394
  let class = module.define_class("Replace", decoder)?;
374
395
  class.define_singleton_method("new", function!(RbReplaceDecoder::new, 2))?;
@@ -222,8 +222,8 @@ pub struct RbSequence {}
222
222
  impl RbSequence {
223
223
  fn new(normalizers: RArray) -> RbResult<RbNormalizer> {
224
224
  let mut sequence = Vec::with_capacity(normalizers.len());
225
- for n in normalizers.each() {
226
- let normalizer: &RbNormalizer = TryConvert::try_convert(n?)?;
225
+ for n in normalizers.into_iter() {
226
+ let normalizer: &RbNormalizer = TryConvert::try_convert(n)?;
227
227
  match &normalizer.normalizer {
228
228
  RbNormalizerTypeWrapper::Sequence(inner) => sequence.extend(inner.iter().cloned()),
229
229
  RbNormalizerTypeWrapper::Single(inner) => sequence.push(inner.clone()),
@@ -1,7 +1,7 @@
1
1
  use std::sync::{Arc, RwLock};
2
2
 
3
3
  use magnus::{
4
- data_type_builder, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Module, Object,
4
+ data_type_builder, exception, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Error, Module, Object,
5
5
  RArray, RClass, RModule, Ruby, TryConvert, TypedData,
6
6
  };
7
7
 
@@ -12,7 +12,7 @@ use tk::pre_tokenizers::bert::BertPreTokenizer;
12
12
  use tk::pre_tokenizers::byte_level::ByteLevel;
13
13
  use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
14
14
  use tk::pre_tokenizers::digits::Digits;
15
- use tk::pre_tokenizers::metaspace::Metaspace;
15
+ use tk::pre_tokenizers::metaspace::{Metaspace, PrependScheme};
16
16
  use tk::pre_tokenizers::punctuation::Punctuation;
17
17
  use tk::pre_tokenizers::split::Split;
18
18
  use tk::pre_tokenizers::unicode_scripts::UnicodeScripts;
@@ -118,14 +118,6 @@ impl RbPreTokenizer {
118
118
  setter!(self, Digits, individual_digits, individual_digits);
119
119
  }
120
120
 
121
- fn metaspace_add_prefix_space(&self) -> bool {
122
- getter!(self, Metaspace, add_prefix_space)
123
- }
124
-
125
- fn metaspace_set_add_prefix_space(&self, add_prefix_space: bool) {
126
- setter!(self, Metaspace, add_prefix_space, add_prefix_space);
127
- }
128
-
129
121
  fn metaspace_replacement(&self) -> String {
130
122
  getter!(self, Metaspace, get_replacement().to_string())
131
123
  }
@@ -133,6 +125,31 @@ impl RbPreTokenizer {
133
125
  fn metaspace_set_replacement(&self, replacement: char) {
134
126
  setter!(self, Metaspace, @set_replacement, replacement);
135
127
  }
128
+
129
+ fn metaspace_split(&self) -> bool {
130
+ getter!(self, Metaspace, get_split())
131
+ }
132
+
133
+ fn metaspace_set_split(&self, split: bool) {
134
+ setter!(self, Metaspace, @set_split, split);
135
+ }
136
+
137
+ fn metaspace_prepend_scheme(&self) -> String {
138
+ // Assuming Metaspace has a method to get the prepend_scheme as a string
139
+ let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
140
+ match scheme {
141
+ PrependScheme::First => "first",
142
+ PrependScheme::Never => "never",
143
+ PrependScheme::Always => "always",
144
+ }
145
+ .to_string()
146
+ }
147
+
148
+ fn metaspace_set_prepend_scheme(&self, prepend_scheme: String) -> RbResult<()> {
149
+ let scheme = from_string(prepend_scheme)?;
150
+ setter!(self, Metaspace, @set_prepend_scheme, scheme);
151
+ Ok(())
152
+ }
136
153
  }
137
154
 
138
155
  impl PreTokenizer for RbPreTokenizer {
@@ -180,9 +197,11 @@ pub struct RbMetaspace {}
180
197
  impl RbMetaspace {
181
198
  fn new(
182
199
  replacement: char,
183
- add_prefix_space: bool,
184
- ) -> RbPreTokenizer {
185
- Metaspace::new(replacement, add_prefix_space).into()
200
+ prepend_scheme: String,
201
+ split: bool,
202
+ ) -> RbResult<RbPreTokenizer> {
203
+ let prepend_scheme = from_string(prepend_scheme)?;
204
+ Ok(Metaspace::new(replacement, prepend_scheme, split).into())
186
205
  }
187
206
  }
188
207
 
@@ -239,8 +258,8 @@ pub struct RbSequence {}
239
258
  impl RbSequence {
240
259
  fn new(pre_tokenizers: RArray) -> RbResult<RbPreTokenizer> {
241
260
  let mut sequence = Vec::with_capacity(pre_tokenizers.len());
242
- for n in pre_tokenizers.each() {
243
- let pretokenizer: &RbPreTokenizer = TryConvert::try_convert(n?)?;
261
+ for n in pre_tokenizers.into_iter() {
262
+ let pretokenizer: &RbPreTokenizer = TryConvert::try_convert(n)?;
244
263
  match &pretokenizer.pretok {
245
264
  RbPreTokenizerTypeWrapper::Sequence(inner) => {
246
265
  sequence.extend(inner.iter().cloned())
@@ -252,6 +271,21 @@ impl RbSequence {
252
271
  }
253
272
  }
254
273
 
274
+ pub(crate) fn from_string(string: String) -> RbResult<PrependScheme> {
275
+ let scheme = match string.as_str() {
276
+ "first" => PrependScheme::First,
277
+ "never" => PrependScheme::Never,
278
+ "always" => PrependScheme::Always,
279
+ _ => {
280
+ return Err(Error::new(exception::arg_error(), format!(
281
+ "{} is an unknown variant, should be one of ['first', 'never', 'always']",
282
+ string
283
+ )));
284
+ }
285
+ };
286
+ Ok(scheme)
287
+ }
288
+
255
289
  #[derive(Clone, Deserialize)]
256
290
  #[serde(untagged)]
257
291
  pub(crate) enum RbPreTokenizerWrapper {
@@ -465,11 +499,13 @@ pub fn init_pre_tokenizers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
465
499
  class.define_method("individual_digits=", method!(RbPreTokenizer::digits_set_individual_digits, 1))?;
466
500
 
467
501
  let class = module.define_class("Metaspace", pre_tokenizer)?;
468
- class.define_singleton_method("_new", function!(RbMetaspace::new, 2))?;
469
- class.define_method("add_prefix_space", method!(RbPreTokenizer::metaspace_add_prefix_space, 0))?;
470
- class.define_method("add_prefix_space=", method!(RbPreTokenizer::metaspace_set_add_prefix_space, 1))?;
502
+ class.define_singleton_method("_new", function!(RbMetaspace::new, 3))?;
503
+ class.define_method("prepend_scheme", method!(RbPreTokenizer::metaspace_prepend_scheme, 0))?;
504
+ class.define_method("prepend_scheme=", method!(RbPreTokenizer::metaspace_set_prepend_scheme, 1))?;
471
505
  class.define_method("replacement", method!(RbPreTokenizer::metaspace_replacement, 0))?;
472
506
  class.define_method("replacement=", method!(RbPreTokenizer::metaspace_set_replacement, 1))?;
507
+ class.define_method("split", method!(RbPreTokenizer::metaspace_split, 0))?;
508
+ class.define_method("split=", method!(RbPreTokenizer::metaspace_set_split, 1))?;
473
509
 
474
510
  let class = module.define_class("Punctuation", pre_tokenizer)?;
475
511
  class.define_singleton_method("_new", function!(RbPunctuation::new, 1))?;
@@ -282,12 +282,12 @@ impl RbTokenizer {
282
282
  add_special_tokens: bool,
283
283
  ) -> RbResult<RArray> {
284
284
  let input: Vec<tk::EncodeInput> = input
285
- .each()
285
+ .into_iter()
286
286
  .map(|o| {
287
287
  let input: tk::EncodeInput = if is_pretokenized {
288
- PreTokenizedEncodeInput::try_convert(o?)?.into()
288
+ PreTokenizedEncodeInput::try_convert(o)?.into()
289
289
  } else {
290
- TextEncodeInput::try_convert(o?)?.into()
290
+ TextEncodeInput::try_convert(o)?.into()
291
291
  };
292
292
  Ok(input)
293
293
  })
@@ -319,26 +319,26 @@ impl RbTokenizer {
319
319
  .map_err(RbError::from)
320
320
  }
321
321
 
322
- pub fn set_decoder(&self, decoder: &RbDecoder) {
323
- self.tokenizer.borrow_mut().with_decoder(decoder.clone());
322
+ pub fn set_decoder(&self, decoder: Option<&RbDecoder>) {
323
+ self.tokenizer.borrow_mut().with_decoder(decoder.cloned());
324
324
  }
325
325
 
326
- pub fn set_pre_tokenizer(&self, pretok: &RbPreTokenizer) {
326
+ pub fn set_pre_tokenizer(&self, pretok: Option<&RbPreTokenizer>) {
327
327
  self.tokenizer
328
328
  .borrow_mut()
329
- .with_pre_tokenizer(pretok.clone());
329
+ .with_pre_tokenizer(pretok.cloned());
330
330
  }
331
331
 
332
- pub fn set_post_processor(&self, processor: &RbPostProcessor) {
332
+ pub fn set_post_processor(&self, processor: Option<&RbPostProcessor>) {
333
333
  self.tokenizer
334
334
  .borrow_mut()
335
- .with_post_processor(processor.clone());
335
+ .with_post_processor(processor.cloned());
336
336
  }
337
337
 
338
- pub fn set_normalizer(&self, normalizer: &RbNormalizer) {
338
+ pub fn set_normalizer(&self, normalizer: Option<&RbNormalizer>) {
339
339
  self.tokenizer
340
340
  .borrow_mut()
341
- .with_normalizer(normalizer.clone());
341
+ .with_normalizer(normalizer.cloned());
342
342
  }
343
343
 
344
344
  pub fn token_to_id(&self, token: String) -> Option<u32> {
@@ -110,9 +110,9 @@ impl RbTrainer {
110
110
  BpeTrainer,
111
111
  special_tokens,
112
112
  special_tokens
113
- .each()
113
+ .into_iter()
114
114
  .map(|token| {
115
- if let Ok(content) = String::try_convert(token?) {
115
+ if let Ok(content) = String::try_convert(token) {
116
116
  Ok(RbAddedToken::from(content, Some(true)).get_token())
117
117
  } else {
118
118
  todo!()
@@ -197,9 +197,9 @@ impl RbTrainer {
197
197
  UnigramTrainer,
198
198
  special_tokens,
199
199
  special_tokens
200
- .each()
200
+ .into_iter()
201
201
  .map(|token| {
202
- if let Ok(content) = String::try_convert(token?) {
202
+ if let Ok(content) = String::try_convert(token) {
203
203
  Ok(RbAddedToken::from(content, Some(true)).get_token())
204
204
  } else {
205
205
  todo!()
@@ -268,9 +268,9 @@ impl RbTrainer {
268
268
  WordLevelTrainer,
269
269
  special_tokens,
270
270
  special_tokens
271
- .each()
271
+ .into_iter()
272
272
  .map(|token| {
273
- if let Ok(content) = String::try_convert(token?) {
273
+ if let Ok(content) = String::try_convert(token) {
274
274
  Ok(RbAddedToken::from(content, Some(true)).get_token())
275
275
  } else {
276
276
  todo!()
@@ -322,9 +322,9 @@ impl RbTrainer {
322
322
  WordPieceTrainer,
323
323
  @set_special_tokens,
324
324
  special_tokens
325
- .each()
325
+ .into_iter()
326
326
  .map(|token| {
327
- if let Ok(content) = String::try_convert(token?) {
327
+ if let Ok(content) = String::try_convert(token) {
328
328
  Ok(RbAddedToken::from(content, Some(true)).get_token())
329
329
  } else {
330
330
  todo!()
@@ -398,9 +398,9 @@ impl RbBpeTrainer {
398
398
  if !value.is_nil() {
399
399
  builder = builder.special_tokens(
400
400
  RArray::try_convert(value)?
401
- .each()
401
+ .into_iter()
402
402
  .map(|token| {
403
- if let Ok(content) = String::try_convert(token?) {
403
+ if let Ok(content) = String::try_convert(token) {
404
404
  Ok(RbAddedToken::from(content, Some(true)).get_token())
405
405
  } else {
406
406
  todo!()
@@ -466,9 +466,9 @@ impl RbUnigramTrainer {
466
466
  if !value.is_nil() {
467
467
  builder.special_tokens(
468
468
  RArray::try_convert(value)?
469
- .each()
469
+ .into_iter()
470
470
  .map(|token| {
471
- if let Ok(content) = String::try_convert(token?) {
471
+ if let Ok(content) = String::try_convert(token) {
472
472
  Ok(RbAddedToken::from(content, Some(true)).get_token())
473
473
  } else {
474
474
  todo!()
@@ -540,9 +540,9 @@ impl RbWordLevelTrainer {
540
540
  if !value.is_nil() {
541
541
  builder.special_tokens(
542
542
  RArray::try_convert(value)?
543
- .each()
543
+ .into_iter()
544
544
  .map(|token| {
545
- if let Ok(content) = String::try_convert(token?) {
545
+ if let Ok(content) = String::try_convert(token) {
546
546
  Ok(RbAddedToken::from(content, Some(true)).get_token())
547
547
  } else {
548
548
  todo!()
@@ -581,9 +581,9 @@ impl RbWordPieceTrainer {
581
581
  if !value.is_nil() {
582
582
  builder = builder.special_tokens(
583
583
  RArray::try_convert(value)?
584
- .each()
584
+ .into_iter()
585
585
  .map(|token| {
586
- if let Ok(content) = String::try_convert(token?) {
586
+ if let Ok(content) = String::try_convert(token) {
587
587
  Ok(RbAddedToken::from(content, Some(true)).get_token())
588
588
  } else {
589
589
  todo!()
@@ -1,8 +1,8 @@
1
1
  module Tokenizers
2
2
  module Decoders
3
3
  class Metaspace
4
- def self.new(replacement: "\u2581", add_prefix_space: true)
5
- _new(replacement, add_prefix_space)
4
+ def self.new(replacement: "\u2581", prepend_scheme: "always", split: true)
5
+ _new(replacement, prepend_scheme, split)
6
6
  end
7
7
  end
8
8
  end
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.15.2"
4
+ TOKENIZERS_VERSION = "0.20.0"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -67,7 +67,7 @@ module Tokenizers
67
67
  end
68
68
  end
69
69
 
70
- options[:content_length_proc] = -> (_) { puts "Downloading..." }
70
+ options[:content_length_proc] = ->(_) { puts "Downloading..." }
71
71
 
72
72
  # string options are headers
73
73
  tempfile = URI.parse(url).open(headers.merge(options))
@@ -1,8 +1,8 @@
1
1
  module Tokenizers
2
2
  module PreTokenizers
3
3
  class Metaspace
4
- def self.new(replacement: "\u2581", add_prefix_space: true)
5
- _new(replacement, add_prefix_space)
4
+ def self.new(replacement: "\u2581", prepend_scheme: "always", split: true)
5
+ _new(replacement, prepend_scheme, split)
6
6
  end
7
7
  end
8
8
  end
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.4.4"
2
+ VERSION = "0.5.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.4
4
+ version: 0.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-02-27 00:00:00.000000000 Z
11
+ date: 2024-08-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -93,14 +93,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
93
93
  requirements:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
- version: '3'
96
+ version: '3.1'
97
97
  required_rubygems_version: !ruby/object:Gem::Requirement
98
98
  requirements:
99
99
  - - ">="
100
100
  - !ruby/object:Gem::Version
101
101
  version: '0'
102
102
  requirements: []
103
- rubygems_version: 3.5.3
103
+ rubygems_version: 3.5.11
104
104
  signing_key:
105
105
  specification_version: 4
106
106
  summary: Fast state-of-the-art tokenizers for Ruby