tokenizers 0.4.4 → 0.5.1

ext/tokenizers/Cargo.toml CHANGED
@@ -1,21 +1,21 @@
 [package]
 name = "tokenizers"
-version = "0.4.4"
+version = "0.5.1"
 license = "Apache-2.0"
 authors = ["Andrew Kane <andrew@ankane.org>"]
 edition = "2021"
-rust-version = "1.62.0"
+rust-version = "1.63.0"
 publish = false

 [lib]
 crate-type = ["cdylib"]

 [dependencies]
-magnus = "0.6"
+magnus = "0.7"
 onig = { version = "6", default-features = false }
 serde = { version = "1", features = ["rc", "derive"] }

 [dependencies.tokenizers]
-version = "=0.15.2" # also update in from_pretrained.rb
+version = "=0.20.0" # also update in from_pretrained.rb
 default-features = false
 features = ["progressbar", "onig", "esaxx_fast"]

ext/tokenizers/src/decoders.rs CHANGED
@@ -1,5 +1,6 @@
 use std::sync::{Arc, RwLock};

+use crate::pre_tokenizers::from_string;
 use magnus::value::Lazy;
 use magnus::{
     data_type_builder, function, method, Class, DataType, DataTypeFunctions, Module, Object, RClass, RModule,
@@ -11,7 +12,7 @@ use tk::decoders::byte_fallback::ByteFallback;
 use tk::decoders::byte_level::ByteLevel;
 use tk::decoders::ctc::CTC;
 use tk::decoders::fuse::Fuse;
-use tk::decoders::metaspace::Metaspace;
+use tk::decoders::metaspace::{Metaspace, PrependScheme};
 use tk::decoders::strip::Strip;
 use tk::decoders::wordpiece::WordPiece;
 use tk::decoders::DecoderWrapper;
@@ -126,12 +127,29 @@ impl RbDecoder {
         setter!(self, Metaspace, @set_replacement, replacement);
     }

-    pub fn metaspace_add_prefix_space(&self) -> bool {
-        getter!(self, Metaspace, add_prefix_space)
+    pub fn metaspace_split(&self) -> bool {
+        getter!(self, Metaspace, get_split())
     }

-    pub fn metaspace_set_add_prefix_space(&self, add_prefix_space: bool) {
-        setter!(self, Metaspace, add_prefix_space, add_prefix_space);
+    pub fn metaspace_set_split(&self, split: bool) {
+        setter!(self, Metaspace, @set_split, split);
+    }
+
+    pub fn metaspace_prepend_scheme(&self) -> String {
+        // Assuming Metaspace has a method to get the prepend_scheme as a string
+        let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
+        match scheme {
+            PrependScheme::First => "first",
+            PrependScheme::Never => "never",
+            PrependScheme::Always => "always",
+        }
+        .to_string()
+    }
+
+    pub fn metaspace_set_prepend_scheme(&self, prepend_scheme: String) -> RbResult<()> {
+        let scheme = from_string(prepend_scheme)?;
+        setter!(self, Metaspace, @set_prepend_scheme, scheme);
+        Ok(())
     }

     pub fn word_piece_cleanup(&self) -> bool {
@@ -194,8 +212,9 @@ impl RbFuse {
 pub struct RbMetaspaceDecoder {}

 impl RbMetaspaceDecoder {
-    pub fn new(replacement: char, add_prefix_space: bool) -> RbDecoder {
-        Metaspace::new(replacement, add_prefix_space).into()
+    pub fn new(replacement: char, prepend_scheme: String, split: bool) -> RbResult<RbDecoder> {
+        let prepend_scheme = from_string(prepend_scheme)?;
+        Ok(Metaspace::new(replacement, prepend_scheme, split).into())
     }
 }

@@ -364,11 +383,13 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
     class.define_singleton_method("new", function!(RbFuse::new, 0))?;

     let class = module.define_class("Metaspace", decoder)?;
-    class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 2))?;
-    class.define_method("add_prefix_space", method!(RbDecoder::metaspace_add_prefix_space, 0))?;
-    class.define_method("add_prefix_space=", method!(RbDecoder::metaspace_set_add_prefix_space, 1))?;
+    class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 3))?;
+    class.define_method("prepend_scheme", method!(RbDecoder::metaspace_prepend_scheme, 0))?;
+    class.define_method("prepend_scheme=", method!(RbDecoder::metaspace_set_prepend_scheme, 1))?;
     class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
     class.define_method("replacement=", method!(RbDecoder::metaspace_set_replacement, 1))?;
+    class.define_method("split", method!(RbDecoder::metaspace_split, 0))?;
+    class.define_method("split=", method!(RbDecoder::metaspace_set_split, 1))?;

     let class = module.define_class("Replace", decoder)?;
     class.define_singleton_method("new", function!(RbReplaceDecoder::new, 2))?;
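
Net effect of the decoder changes on the Ruby API: Metaspace's add_prefix_space flag is replaced by a prepend_scheme string ("always", "first", or "never") plus a split flag, tracking the tokenizers 0.20 crate. A minimal sketch of the new surface (method names taken from the bindings above; the replacement default comes from the Ruby wrapper later in this diff):

    require "tokenizers"

    decoder = Tokenizers::Decoders::Metaspace.new(prepend_scheme: "first", split: false)
    decoder.prepend_scheme        # => "first"
    decoder.split                 # => false
    decoder.prepend_scheme = "never"
    decoder.replacement           # => "▁" (U+2581, the default replacement)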

ext/tokenizers/src/normalizers.rs CHANGED
@@ -222,8 +222,8 @@ pub struct RbSequence {}
 impl RbSequence {
     fn new(normalizers: RArray) -> RbResult<RbNormalizer> {
         let mut sequence = Vec::with_capacity(normalizers.len());
-        for n in normalizers.each() {
-            let normalizer: &RbNormalizer = TryConvert::try_convert(n?)?;
+        for n in normalizers.into_iter() {
+            let normalizer: &RbNormalizer = TryConvert::try_convert(n)?;
             match &normalizer.normalizer {
                 RbNormalizerTypeWrapper::Sequence(inner) => sequence.extend(inner.iter().cloned()),
                 RbNormalizerTypeWrapper::Single(inner) => sequence.push(inner.clone()),
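
This hunk is purely the magnus 0.6 → 0.7 migration: RArray#each yielded Result-wrapped values (hence the n?), while into_iter yields values directly. The Ruby-facing behavior is unchanged; a Sequence still flattens nested sequences into a single list. A hedged sketch (the NFD and Lowercase class names are assumed to be exposed by the gem, mirroring the Python bindings):

    # assumed class names; any two normalizers exposed by the gem work the same way
    normalizer = Tokenizers::Normalizers::Sequence.new([
      Tokenizers::Normalizers::NFD.new,
      Tokenizers::Normalizers::Lowercase.new
    ])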

ext/tokenizers/src/pre_tokenizers.rs CHANGED
@@ -1,7 +1,7 @@
 use std::sync::{Arc, RwLock};

 use magnus::{
-    data_type_builder, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Module, Object,
+    data_type_builder, exception, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Error, Module, Object,
     RArray, RClass, RModule, Ruby, TryConvert, TypedData,
 };

@@ -12,7 +12,7 @@ use tk::pre_tokenizers::bert::BertPreTokenizer;
 use tk::pre_tokenizers::byte_level::ByteLevel;
 use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
 use tk::pre_tokenizers::digits::Digits;
-use tk::pre_tokenizers::metaspace::Metaspace;
+use tk::pre_tokenizers::metaspace::{Metaspace, PrependScheme};
 use tk::pre_tokenizers::punctuation::Punctuation;
 use tk::pre_tokenizers::split::Split;
 use tk::pre_tokenizers::unicode_scripts::UnicodeScripts;
@@ -118,14 +118,6 @@ impl RbPreTokenizer {
         setter!(self, Digits, individual_digits, individual_digits);
     }

-    fn metaspace_add_prefix_space(&self) -> bool {
-        getter!(self, Metaspace, add_prefix_space)
-    }
-
-    fn metaspace_set_add_prefix_space(&self, add_prefix_space: bool) {
-        setter!(self, Metaspace, add_prefix_space, add_prefix_space);
-    }
-
     fn metaspace_replacement(&self) -> String {
         getter!(self, Metaspace, get_replacement().to_string())
     }
@@ -133,6 +125,31 @@ impl RbPreTokenizer {
     fn metaspace_set_replacement(&self, replacement: char) {
         setter!(self, Metaspace, @set_replacement, replacement);
     }
+
+    fn metaspace_split(&self) -> bool {
+        getter!(self, Metaspace, get_split())
+    }
+
+    fn metaspace_set_split(&self, split: bool) {
+        setter!(self, Metaspace, @set_split, split);
+    }
+
+    fn metaspace_prepend_scheme(&self) -> String {
+        // Assuming Metaspace has a method to get the prepend_scheme as a string
+        let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
+        match scheme {
+            PrependScheme::First => "first",
+            PrependScheme::Never => "never",
+            PrependScheme::Always => "always",
+        }
+        .to_string()
+    }
+
+    fn metaspace_set_prepend_scheme(&self, prepend_scheme: String) -> RbResult<()> {
+        let scheme = from_string(prepend_scheme)?;
+        setter!(self, Metaspace, @set_prepend_scheme, scheme);
+        Ok(())
+    }
 }

 impl PreTokenizer for RbPreTokenizer {
@@ -180,9 +197,11 @@ pub struct RbMetaspace {}
 impl RbMetaspace {
     fn new(
         replacement: char,
-        add_prefix_space: bool,
-    ) -> RbPreTokenizer {
-        Metaspace::new(replacement, add_prefix_space).into()
+        prepend_scheme: String,
+        split: bool,
+    ) -> RbResult<RbPreTokenizer> {
+        let prepend_scheme = from_string(prepend_scheme)?;
+        Ok(Metaspace::new(replacement, prepend_scheme, split).into())
     }
 }

@@ -239,8 +258,8 @@ pub struct RbSequence {}
 impl RbSequence {
     fn new(pre_tokenizers: RArray) -> RbResult<RbPreTokenizer> {
         let mut sequence = Vec::with_capacity(pre_tokenizers.len());
-        for n in pre_tokenizers.each() {
-            let pretokenizer: &RbPreTokenizer = TryConvert::try_convert(n?)?;
+        for n in pre_tokenizers.into_iter() {
+            let pretokenizer: &RbPreTokenizer = TryConvert::try_convert(n)?;
             match &pretokenizer.pretok {
                 RbPreTokenizerTypeWrapper::Sequence(inner) => {
                     sequence.extend(inner.iter().cloned())
@@ -252,6 +271,21 @@ impl RbSequence {
     }
 }

+pub(crate) fn from_string(string: String) -> RbResult<PrependScheme> {
+    let scheme = match string.as_str() {
+        "first" => PrependScheme::First,
+        "never" => PrependScheme::Never,
+        "always" => PrependScheme::Always,
+        _ => {
+            return Err(Error::new(exception::arg_error(), format!(
+                "{} is an unknown variant, should be one of ['first', 'never', 'always']",
+                string
+            )));
+        }
+    };
+    Ok(scheme)
+}
+
 #[derive(Clone, Deserialize)]
 #[serde(untagged)]
 pub(crate) enum RbPreTokenizerWrapper {
@@ -465,11 +499,13 @@ pub fn init_pre_tokenizers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
     class.define_method("individual_digits=", method!(RbPreTokenizer::digits_set_individual_digits, 1))?;

     let class = module.define_class("Metaspace", pre_tokenizer)?;
-    class.define_singleton_method("_new", function!(RbMetaspace::new, 2))?;
-    class.define_method("add_prefix_space", method!(RbPreTokenizer::metaspace_add_prefix_space, 0))?;
-    class.define_method("add_prefix_space=", method!(RbPreTokenizer::metaspace_set_add_prefix_space, 1))?;
+    class.define_singleton_method("_new", function!(RbMetaspace::new, 3))?;
+    class.define_method("prepend_scheme", method!(RbPreTokenizer::metaspace_prepend_scheme, 0))?;
+    class.define_method("prepend_scheme=", method!(RbPreTokenizer::metaspace_set_prepend_scheme, 1))?;
     class.define_method("replacement", method!(RbPreTokenizer::metaspace_replacement, 0))?;
     class.define_method("replacement=", method!(RbPreTokenizer::metaspace_set_replacement, 1))?;
+    class.define_method("split", method!(RbPreTokenizer::metaspace_split, 0))?;
+    class.define_method("split=", method!(RbPreTokenizer::metaspace_set_split, 1))?;

     let class = module.define_class("Punctuation", pre_tokenizer)?;
     class.define_singleton_method("_new", function!(RbPunctuation::new, 1))?;
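
The new from_string helper is the single validation point for scheme names, and both the pre-tokenizer and decoder setters route through it, so an invalid value surfaces in Ruby as an ArgumentError (magnus's exception::arg_error). A sketch of the expected behavior:

    pre = Tokenizers::PreTokenizers::Metaspace.new(prepend_scheme: "never")
    pre.prepend_scheme               # => "never"
    pre.prepend_scheme = "sometimes"
    # ArgumentError: sometimes is an unknown variant, should be one of ['first', 'never', 'always']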

ext/tokenizers/src/tokenizer.rs CHANGED
@@ -282,12 +282,12 @@ impl RbTokenizer {
         add_special_tokens: bool,
     ) -> RbResult<RArray> {
         let input: Vec<tk::EncodeInput> = input
-            .each()
+            .into_iter()
             .map(|o| {
                 let input: tk::EncodeInput = if is_pretokenized {
-                    PreTokenizedEncodeInput::try_convert(o?)?.into()
+                    PreTokenizedEncodeInput::try_convert(o)?.into()
                 } else {
-                    TextEncodeInput::try_convert(o?)?.into()
+                    TextEncodeInput::try_convert(o)?.into()
                 };
                 Ok(input)
             })
@@ -319,26 +319,26 @@ impl RbTokenizer {
             .map_err(RbError::from)
     }

-    pub fn set_decoder(&self, decoder: &RbDecoder) {
-        self.tokenizer.borrow_mut().with_decoder(decoder.clone());
+    pub fn set_decoder(&self, decoder: Option<&RbDecoder>) {
+        self.tokenizer.borrow_mut().with_decoder(decoder.cloned());
     }

-    pub fn set_pre_tokenizer(&self, pretok: &RbPreTokenizer) {
+    pub fn set_pre_tokenizer(&self, pretok: Option<&RbPreTokenizer>) {
         self.tokenizer
             .borrow_mut()
-            .with_pre_tokenizer(pretok.clone());
+            .with_pre_tokenizer(pretok.cloned());
     }

-    pub fn set_post_processor(&self, processor: &RbPostProcessor) {
+    pub fn set_post_processor(&self, processor: Option<&RbPostProcessor>) {
         self.tokenizer
             .borrow_mut()
-            .with_post_processor(processor.clone());
+            .with_post_processor(processor.cloned());
     }

-    pub fn set_normalizer(&self, normalizer: &RbNormalizer) {
+    pub fn set_normalizer(&self, normalizer: Option<&RbNormalizer>) {
         self.tokenizer
             .borrow_mut()
-            .with_normalizer(normalizer.clone());
+            .with_normalizer(normalizer.cloned());
     }

     pub fn token_to_id(&self, token: String) -> Option<u32> {
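
Switching the setters to Option<&T> lets Ruby pass nil to clear a component, since the tokenizers 0.20 crate's with_decoder and friends now take an Option. A hedged sketch, assuming the gem maps these setters to decoder=-style accessors as in the Python bindings:

    tokenizer = Tokenizers.from_pretrained("bert-base-uncased")
    tokenizer.decoder = nil       # clears the decoder rather than raising a TypeError
    tokenizer.normalizer = nil    # likewise for normalizer, pre_tokenizer, post_processor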

ext/tokenizers/src/trainers.rs CHANGED
@@ -110,9 +110,9 @@ impl RbTrainer {
                 BpeTrainer,
                 special_tokens,
                 special_tokens
-                    .each()
+                    .into_iter()
                     .map(|token| {
-                        if let Ok(content) = String::try_convert(token?) {
+                        if let Ok(content) = String::try_convert(token) {
                             Ok(RbAddedToken::from(content, Some(true)).get_token())
                         } else {
                             todo!()
@@ -197,9 +197,9 @@ impl RbTrainer {
                 UnigramTrainer,
                 special_tokens,
                 special_tokens
-                    .each()
+                    .into_iter()
                     .map(|token| {
-                        if let Ok(content) = String::try_convert(token?) {
+                        if let Ok(content) = String::try_convert(token) {
                             Ok(RbAddedToken::from(content, Some(true)).get_token())
                         } else {
                             todo!()
@@ -268,9 +268,9 @@ impl RbTrainer {
                 WordLevelTrainer,
                 special_tokens,
                 special_tokens
-                    .each()
+                    .into_iter()
                     .map(|token| {
-                        if let Ok(content) = String::try_convert(token?) {
+                        if let Ok(content) = String::try_convert(token) {
                             Ok(RbAddedToken::from(content, Some(true)).get_token())
                         } else {
                             todo!()
@@ -322,9 +322,9 @@ impl RbTrainer {
                 WordPieceTrainer,
                 @set_special_tokens,
                 special_tokens
-                    .each()
+                    .into_iter()
                     .map(|token| {
-                        if let Ok(content) = String::try_convert(token?) {
+                        if let Ok(content) = String::try_convert(token) {
                             Ok(RbAddedToken::from(content, Some(true)).get_token())
                         } else {
                             todo!()
@@ -398,9 +398,9 @@ impl RbBpeTrainer {
         if !value.is_nil() {
             builder = builder.special_tokens(
                 RArray::try_convert(value)?
-                    .each()
+                    .into_iter()
                     .map(|token| {
-                        if let Ok(content) = String::try_convert(token?) {
+                        if let Ok(content) = String::try_convert(token) {
                             Ok(RbAddedToken::from(content, Some(true)).get_token())
                         } else {
                             todo!()
@@ -466,9 +466,9 @@ impl RbUnigramTrainer {
         if !value.is_nil() {
             builder.special_tokens(
                 RArray::try_convert(value)?
-                    .each()
+                    .into_iter()
                     .map(|token| {
-                        if let Ok(content) = String::try_convert(token?) {
+                        if let Ok(content) = String::try_convert(token) {
                             Ok(RbAddedToken::from(content, Some(true)).get_token())
                         } else {
                             todo!()
@@ -540,9 +540,9 @@ impl RbWordLevelTrainer {
         if !value.is_nil() {
             builder.special_tokens(
                 RArray::try_convert(value)?
-                    .each()
+                    .into_iter()
                     .map(|token| {
-                        if let Ok(content) = String::try_convert(token?) {
+                        if let Ok(content) = String::try_convert(token) {
                             Ok(RbAddedToken::from(content, Some(true)).get_token())
                         } else {
                             todo!()
@@ -581,9 +581,9 @@ impl RbWordPieceTrainer {
         if !value.is_nil() {
             builder = builder.special_tokens(
                 RArray::try_convert(value)?
-                    .each()
+                    .into_iter()
                     .map(|token| {
-                        if let Ok(content) = String::try_convert(token?) {
+                        if let Ok(content) = String::try_convert(token) {
                             Ok(RbAddedToken::from(content, Some(true)).get_token())
                         } else {
                             todo!()
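
All eight trainer hunks make the same each → into_iter swap while converting special_tokens; the conversion itself is untouched (String entries become AddedTokens flagged as special, and the todo!() branch still panics on non-String entries). Typical usage, unchanged by this release:

    # special_tokens entries must be strings; other types still hit the todo!() branch
    trainer = Tokenizers::Trainers::BpeTrainer.new(special_tokens: ["[UNK]", "[CLS]", "[SEP]"])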

lib/tokenizers/decoders/metaspace.rb CHANGED
@@ -1,8 +1,8 @@
 module Tokenizers
   module Decoders
     class Metaspace
-      def self.new(replacement: "\u2581", add_prefix_space: true)
-        _new(replacement, add_prefix_space)
+      def self.new(replacement: "\u2581", prepend_scheme: "always", split: true)
+        _new(replacement, prepend_scheme, split)
       end
     end
   end
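
For callers migrating from 0.4.x: the old default add_prefix_space: true corresponds to prepend_scheme: "always", and add_prefix_space: false to "never", so the new defaults preserve prior behavior. For example:

    # 0.4.x: Tokenizers::Decoders::Metaspace.new(add_prefix_space: false)
    # 0.5.x equivalent:
    Tokenizers::Decoders::Metaspace.new(prepend_scheme: "never")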

lib/tokenizers/from_pretrained.rb CHANGED
@@ -1,7 +1,7 @@
 module Tokenizers
   module FromPretrained
     # for user agent
-    TOKENIZERS_VERSION = "0.15.2"
+    TOKENIZERS_VERSION = "0.20.0"

     # use Ruby for downloads
     # this avoids the need to vendor OpenSSL on Linux
@@ -67,7 +67,7 @@ module Tokenizers
         end
       end

-      options[:content_length_proc] = -> (_) { puts "Downloading..." }
+      options[:content_length_proc] = ->(_) { puts "Downloading..." }

       # string options are headers
       tempfile = URI.parse(url).open(headers.merge(options))
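
TOKENIZERS_VERSION only feeds the User-Agent header for Hugging Face Hub downloads, which is why it is bumped in lockstep with the pinned crate version in Cargo.toml (per the comment there). Typical entry point:

    tokenizer = Tokenizers.from_pretrained("bert-base-uncased")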

lib/tokenizers/pre_tokenizers/metaspace.rb CHANGED
@@ -1,8 +1,8 @@
 module Tokenizers
   module PreTokenizers
     class Metaspace
-      def self.new(replacement: "\u2581", add_prefix_space: true)
-        _new(replacement, add_prefix_space)
+      def self.new(replacement: "\u2581", prepend_scheme: "always", split: true)
+        _new(replacement, prepend_scheme, split)
       end
     end
   end

lib/tokenizers/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Tokenizers
-  VERSION = "0.4.4"
+  VERSION = "0.5.1"
 end

metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.4.4
+  version: 0.5.1
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-02-27 00:00:00.000000000 Z
+date: 2024-08-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -93,14 +93,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '3'
+      version: '3.1'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.5.3
+rubygems_version: 3.5.11
 signing_key:
 specification_version: 4
 summary: Fast state-of-the-art tokenizers for Ruby