tokenizers 0.3.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,16 +3,16 @@ use std::sync::{Arc, RwLock};
3
3
 
4
4
  use crate::models::RbModel;
5
5
  use crate::tokenizer::RbAddedToken;
6
- use magnus::typed_data::DataTypeBuilder;
6
+ use magnus::prelude::*;
7
7
  use magnus::{
8
- exception, function, memoize, method, Class, DataType, DataTypeFunctions, Error, Module, Object,
9
- RArray, RClass, RHash, RModule, Symbol, TypedData, Value,
8
+ data_type_builder, exception, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Error, Module, Object,
9
+ RArray, RClass, RHash, RModule, Ruby, Symbol, TryConvert, TypedData, Value,
10
10
  };
11
11
  use serde::{Deserialize, Serialize};
12
12
  use tk::models::TrainerWrapper;
13
13
  use tk::Trainer;
14
14
 
15
- use super::RbResult;
15
+ use super::{RbResult, TRAINERS};
16
16
 
17
17
  #[derive(DataTypeFunctions, Clone, Deserialize, Serialize)]
18
18
  pub struct RbTrainer {
@@ -112,7 +112,7 @@ impl RbTrainer {
112
112
  special_tokens
113
113
  .each()
114
114
  .map(|token| {
115
- if let Ok(content) = token?.try_convert::<String>() {
115
+ if let Ok(content) = String::try_convert(token?) {
116
116
  Ok(RbAddedToken::from(content, Some(true)).get_token())
117
117
  } else {
118
118
  todo!()
@@ -144,7 +144,7 @@ impl RbTrainer {
144
144
  self,
145
145
  BpeTrainer,
146
146
  initial_alphabet,
147
- alphabet.into_iter().map(|c| c).collect()
147
+ alphabet.into_iter().collect()
148
148
  );
149
149
  }
150
150
 
@@ -199,7 +199,7 @@ impl RbTrainer {
199
199
  special_tokens
200
200
  .each()
201
201
  .map(|token| {
202
- if let Ok(content) = token?.try_convert::<String>() {
202
+ if let Ok(content) = String::try_convert(token?) {
203
203
  Ok(RbAddedToken::from(content, Some(true)).get_token())
204
204
  } else {
205
205
  todo!()
@@ -223,7 +223,7 @@ impl RbTrainer {
223
223
  self,
224
224
  UnigramTrainer,
225
225
  initial_alphabet,
226
- alphabet.into_iter().map(|c| c).collect()
226
+ alphabet.into_iter().collect()
227
227
  );
228
228
  }
229
229
 
@@ -270,7 +270,7 @@ impl RbTrainer {
270
270
  special_tokens
271
271
  .each()
272
272
  .map(|token| {
273
- if let Ok(content) = token?.try_convert::<String>() {
273
+ if let Ok(content) = String::try_convert(token?) {
274
274
  Ok(RbAddedToken::from(content, Some(true)).get_token())
275
275
  } else {
276
276
  todo!()
@@ -324,7 +324,7 @@ impl RbTrainer {
324
324
  special_tokens
325
325
  .each()
326
326
  .map(|token| {
327
- if let Ok(content) = token?.try_convert::<String>() {
327
+ if let Ok(content) = String::try_convert(token?) {
328
328
  Ok(RbAddedToken::from(content, Some(true)).get_token())
329
329
  } else {
330
330
  todo!()
@@ -356,7 +356,7 @@ impl RbTrainer {
356
356
  self,
357
357
  WordPieceTrainer,
358
358
  @set_initial_alphabet,
359
- alphabet.into_iter().map(|c| c).collect()
359
+ alphabet.into_iter().collect()
360
360
  );
361
361
  }
362
362
 
@@ -397,11 +397,10 @@ impl RbBpeTrainer {
397
397
  let value: Value = kwargs.delete(Symbol::new("special_tokens"))?;
398
398
  if !value.is_nil() {
399
399
  builder = builder.special_tokens(
400
- value
401
- .try_convert::<RArray>()?
400
+ RArray::try_convert(value)?
402
401
  .each()
403
402
  .map(|token| {
404
- if let Ok(content) = token?.try_convert::<String>() {
403
+ if let Ok(content) = String::try_convert(token?) {
405
404
  Ok(RbAddedToken::from(content, Some(true)).get_token())
406
405
  } else {
407
406
  todo!()
@@ -413,39 +412,39 @@ impl RbBpeTrainer {
413
412
 
414
413
  let value: Value = kwargs.delete(Symbol::new("initial_alphabet"))?;
415
414
  if !value.is_nil() {
416
- let arr = value.try_convert::<Vec<char>>()?;
415
+ let arr = <Vec<char>>::try_convert(value)?;
417
416
  let set: HashSet<char> = HashSet::from_iter(arr);
418
417
  builder = builder.initial_alphabet(set);
419
418
  }
420
419
 
421
420
  let value: Value = kwargs.delete(Symbol::new("vocab_size"))?;
422
421
  if !value.is_nil() {
423
- builder = builder.vocab_size(value.try_convert()?);
422
+ builder = builder.vocab_size(TryConvert::try_convert(value)?);
424
423
  }
425
424
 
426
425
  let value: Value = kwargs.delete(Symbol::new("min_frequency"))?;
427
426
  if !value.is_nil() {
428
- builder = builder.min_frequency(value.try_convert()?);
427
+ builder = builder.min_frequency(TryConvert::try_convert(value)?);
429
428
  }
430
429
 
431
430
  let value: Value = kwargs.delete(Symbol::new("show_progress"))?;
432
431
  if !value.is_nil() {
433
- builder = builder.show_progress(value.try_convert()?);
432
+ builder = builder.show_progress(TryConvert::try_convert(value)?);
434
433
  }
435
434
 
436
435
  let value: Value = kwargs.delete(Symbol::new("limit_alphabet"))?;
437
436
  if !value.is_nil() {
438
- builder = builder.limit_alphabet(value.try_convert()?);
437
+ builder = builder.limit_alphabet(TryConvert::try_convert(value)?);
439
438
  }
440
439
 
441
440
  let value: Value = kwargs.delete(Symbol::new("continuing_subword_prefix"))?;
442
441
  if !value.is_nil() {
443
- builder = builder.continuing_subword_prefix(value.try_convert()?);
442
+ builder = builder.continuing_subword_prefix(TryConvert::try_convert(value)?);
444
443
  }
445
444
 
446
445
  let value: Value = kwargs.delete(Symbol::new("end_of_word_suffix"))?;
447
446
  if !value.is_nil() {
448
- builder = builder.end_of_word_suffix(value.try_convert()?);
447
+ builder = builder.end_of_word_suffix(TryConvert::try_convert(value)?);
449
448
  }
450
449
 
451
450
  if !kwargs.is_empty() {
@@ -466,11 +465,10 @@ impl RbUnigramTrainer {
466
465
  let value: Value = kwargs.delete(Symbol::new("special_tokens"))?;
467
466
  if !value.is_nil() {
468
467
  builder.special_tokens(
469
- value
470
- .try_convert::<RArray>()?
468
+ RArray::try_convert(value)?
471
469
  .each()
472
470
  .map(|token| {
473
- if let Ok(content) = token?.try_convert::<String>() {
471
+ if let Ok(content) = String::try_convert(token?) {
474
472
  Ok(RbAddedToken::from(content, Some(true)).get_token())
475
473
  } else {
476
474
  todo!()
@@ -482,44 +480,44 @@ impl RbUnigramTrainer {
482
480
 
483
481
  let value: Value = kwargs.delete(Symbol::new("initial_alphabet"))?;
484
482
  if !value.is_nil() {
485
- let arr = value.try_convert::<Vec<char>>()?;
483
+ let arr = <Vec<char>>::try_convert(value)?;
486
484
  let set: HashSet<char> = HashSet::from_iter(arr);
487
485
  builder.initial_alphabet(set);
488
486
  }
489
487
 
490
488
  let value: Value = kwargs.delete(Symbol::new("vocab_size"))?;
491
489
  if !value.is_nil() {
492
- builder.vocab_size(value.try_convert()?);
490
+ builder.vocab_size(TryConvert::try_convert(value)?);
493
491
  }
494
492
 
495
493
  let value: Value = kwargs.delete(Symbol::new("show_progress"))?;
496
494
  if !value.is_nil() {
497
- builder.show_progress(value.try_convert()?);
495
+ builder.show_progress(TryConvert::try_convert(value)?);
498
496
  }
499
497
 
500
498
  let value: Value = kwargs.delete(Symbol::new("n_sub_iterations"))?;
501
499
  if !value.is_nil() {
502
- builder.n_sub_iterations(value.try_convert()?);
500
+ builder.n_sub_iterations(TryConvert::try_convert(value)?);
503
501
  }
504
502
 
505
503
  let value: Value = kwargs.delete(Symbol::new("unk_token"))?;
506
504
  if !value.is_nil() {
507
- builder.unk_token(Some(value.try_convert()?));
505
+ builder.unk_token(Some(TryConvert::try_convert(value)?));
508
506
  }
509
507
 
510
508
  let value: Value = kwargs.delete(Symbol::new("max_piece_length"))?;
511
509
  if !value.is_nil() {
512
- builder.max_piece_length(value.try_convert()?);
510
+ builder.max_piece_length(TryConvert::try_convert(value)?);
513
511
  }
514
512
 
515
513
  let value: Value = kwargs.delete(Symbol::new("seed_size"))?;
516
514
  if !value.is_nil() {
517
- builder.seed_size(value.try_convert()?);
515
+ builder.seed_size(TryConvert::try_convert(value)?);
518
516
  }
519
517
 
520
518
  let value: Value = kwargs.delete(Symbol::new("shrinking_factor"))?;
521
519
  if !value.is_nil() {
522
- builder.shrinking_factor(value.try_convert()?);
520
+ builder.shrinking_factor(TryConvert::try_convert(value)?);
523
521
  }
524
522
 
525
523
  if !kwargs.is_empty() {
@@ -541,11 +539,10 @@ impl RbWordLevelTrainer {
541
539
  let value: Value = kwargs.delete(Symbol::new("special_tokens"))?;
542
540
  if !value.is_nil() {
543
541
  builder.special_tokens(
544
- value
545
- .try_convert::<RArray>()?
542
+ RArray::try_convert(value)?
546
543
  .each()
547
544
  .map(|token| {
548
- if let Ok(content) = token?.try_convert::<String>() {
545
+ if let Ok(content) = String::try_convert(token?) {
549
546
  Ok(RbAddedToken::from(content, Some(true)).get_token())
550
547
  } else {
551
548
  todo!()
@@ -557,17 +554,17 @@ impl RbWordLevelTrainer {
557
554
 
558
555
  let value: Value = kwargs.delete(Symbol::new("vocab_size"))?;
559
556
  if !value.is_nil() {
560
- builder.vocab_size(value.try_convert()?);
557
+ builder.vocab_size(TryConvert::try_convert(value)?);
561
558
  }
562
559
 
563
560
  let value: Value = kwargs.delete(Symbol::new("min_frequency"))?;
564
561
  if !value.is_nil() {
565
- builder.min_frequency(value.try_convert()?);
562
+ builder.min_frequency(TryConvert::try_convert(value)?);
566
563
  }
567
564
 
568
565
  let value: Value = kwargs.delete(Symbol::new("show_progress"))?;
569
566
  if !value.is_nil() {
570
- builder.show_progress(value.try_convert()?);
567
+ builder.show_progress(TryConvert::try_convert(value)?);
571
568
  }
572
569
 
573
570
  Ok(builder.build().expect("WordLevelTrainerBuilder cannot fail").into())
@@ -583,11 +580,10 @@ impl RbWordPieceTrainer {
583
580
  let value: Value = kwargs.delete(Symbol::new("special_tokens"))?;
584
581
  if !value.is_nil() {
585
582
  builder = builder.special_tokens(
586
- value
587
- .try_convert::<RArray>()?
583
+ RArray::try_convert(value)?
588
584
  .each()
589
585
  .map(|token| {
590
- if let Ok(content) = token?.try_convert::<String>() {
586
+ if let Ok(content) = String::try_convert(token?) {
591
587
  Ok(RbAddedToken::from(content, Some(true)).get_token())
592
588
  } else {
593
589
  todo!()
@@ -599,39 +595,39 @@ impl RbWordPieceTrainer {
599
595
 
600
596
  let value: Value = kwargs.delete(Symbol::new("initial_alphabet"))?;
601
597
  if !value.is_nil() {
602
- let arr = value.try_convert::<Vec<char>>()?;
598
+ let arr = <Vec<char>>::try_convert(value)?;
603
599
  let set: HashSet<char> = HashSet::from_iter(arr);
604
600
  builder = builder.initial_alphabet(set);
605
601
  }
606
602
 
607
603
  let value: Value = kwargs.delete(Symbol::new("vocab_size"))?;
608
604
  if !value.is_nil() {
609
- builder = builder.vocab_size(value.try_convert()?);
605
+ builder = builder.vocab_size(TryConvert::try_convert(value)?);
610
606
  }
611
607
 
612
608
  let value: Value = kwargs.delete(Symbol::new("min_frequency"))?;
613
609
  if !value.is_nil() {
614
- builder = builder.min_frequency(value.try_convert()?);
610
+ builder = builder.min_frequency(TryConvert::try_convert(value)?);
615
611
  }
616
612
 
617
613
  let value: Value = kwargs.delete(Symbol::new("show_progress"))?;
618
614
  if !value.is_nil() {
619
- builder = builder.show_progress(value.try_convert()?);
615
+ builder = builder.show_progress(TryConvert::try_convert(value)?);
620
616
  }
621
617
 
622
618
  let value: Value = kwargs.delete(Symbol::new("limit_alphabet"))?;
623
619
  if !value.is_nil() {
624
- builder = builder.limit_alphabet(value.try_convert()?);
620
+ builder = builder.limit_alphabet(TryConvert::try_convert(value)?);
625
621
  }
626
622
 
627
623
  let value: Value = kwargs.delete(Symbol::new("continuing_subword_prefix"))?;
628
624
  if !value.is_nil() {
629
- builder = builder.continuing_subword_prefix(value.try_convert()?);
625
+ builder = builder.continuing_subword_prefix(TryConvert::try_convert(value)?);
630
626
  }
631
627
 
632
628
  let value: Value = kwargs.delete(Symbol::new("end_of_word_suffix"))?;
633
629
  if !value.is_nil() {
634
- builder = builder.end_of_word_suffix(value.try_convert()?);
630
+ builder = builder.end_of_word_suffix(TryConvert::try_convert(value)?);
635
631
  }
636
632
 
637
633
  if !kwargs.is_empty() {
@@ -644,46 +640,52 @@ impl RbWordPieceTrainer {
644
640
  }
645
641
 
646
642
  unsafe impl TypedData for RbTrainer {
647
- fn class() -> RClass {
648
- *memoize!(RClass: {
649
- let class: RClass = crate::trainers().const_get("Trainer").unwrap();
650
- class.undef_alloc_func();
651
- class
652
- })
643
+ fn class(ruby: &Ruby) -> RClass {
644
+ static CLASS: Lazy<RClass> = Lazy::new(|ruby| {
645
+ let class: RClass = ruby.get_inner(&TRAINERS).const_get("Trainer").unwrap();
646
+ class.undef_default_alloc_func();
647
+ class
648
+ });
649
+ ruby.get_inner(&CLASS)
653
650
  }
654
651
 
655
652
  fn data_type() -> &'static DataType {
656
- memoize!(DataType: DataTypeBuilder::<RbTrainer>::new("Tokenizers::Trainers::Trainer").build())
657
- }
658
-
659
- fn class_for(value: &Self) -> RClass {
653
+ static DATA_TYPE: DataType = data_type_builder!(RbTrainer, "Tokenizers::Trainers::Trainer").build();
654
+ &DATA_TYPE
655
+ }
656
+
657
+ fn class_for(ruby: &Ruby, value: &Self) -> RClass {
658
+ static BPE_TRAINER: Lazy<RClass> = Lazy::new(|ruby| {
659
+ let class: RClass = ruby.get_inner(&TRAINERS).const_get("BpeTrainer").unwrap();
660
+ class.undef_default_alloc_func();
661
+ class
662
+ });
663
+ static UNIGRAM_TRAINER: Lazy<RClass> = Lazy::new(|ruby| {
664
+ let class: RClass = ruby.get_inner(&TRAINERS).const_get("UnigramTrainer").unwrap();
665
+ class.undef_default_alloc_func();
666
+ class
667
+ });
668
+ static WORD_LEVEL_TRAINER: Lazy<RClass> = Lazy::new(|ruby| {
669
+ let class: RClass = ruby.get_inner(&TRAINERS).const_get("WordLevelTrainer").unwrap();
670
+ class.undef_default_alloc_func();
671
+ class
672
+ });
673
+ static WORD_PIECE_TRAINER: Lazy<RClass> = Lazy::new(|ruby| {
674
+ let class: RClass = ruby.get_inner(&TRAINERS).const_get("WordPieceTrainer").unwrap();
675
+ class.undef_default_alloc_func();
676
+ class
677
+ });
660
678
  match *value.trainer.read().unwrap() {
661
- TrainerWrapper::BpeTrainer(_) => *memoize!(RClass: {
662
- let class: RClass = crate::trainers().const_get("BpeTrainer").unwrap();
663
- class.undef_alloc_func();
664
- class
665
- }),
666
- TrainerWrapper::UnigramTrainer(_) => *memoize!(RClass: {
667
- let class: RClass = crate::trainers().const_get("UnigramTrainer").unwrap();
668
- class.undef_alloc_func();
669
- class
670
- }),
671
- TrainerWrapper::WordLevelTrainer(_) => *memoize!(RClass: {
672
- let class: RClass = crate::trainers().const_get("WordLevelTrainer").unwrap();
673
- class.undef_alloc_func();
674
- class
675
- }),
676
- TrainerWrapper::WordPieceTrainer(_) => *memoize!(RClass: {
677
- let class: RClass = crate::trainers().const_get("WordPieceTrainer").unwrap();
678
- class.undef_alloc_func();
679
- class
680
- }),
679
+ TrainerWrapper::BpeTrainer(_) => ruby.get_inner(&BPE_TRAINER),
680
+ TrainerWrapper::UnigramTrainer(_) => ruby.get_inner(&UNIGRAM_TRAINER),
681
+ TrainerWrapper::WordLevelTrainer(_) => ruby.get_inner(&WORD_LEVEL_TRAINER),
682
+ TrainerWrapper::WordPieceTrainer(_) => ruby.get_inner(&WORD_PIECE_TRAINER),
681
683
  }
682
684
  }
683
685
  }
684
686
 
685
- pub fn trainers(module: &RModule) -> RbResult<()> {
686
- let trainer = module.define_class("Trainer", Default::default())?;
687
+ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
688
+ let trainer = module.define_class("Trainer", ruby.class_object())?;
687
689
 
688
690
  let class = module.define_class("BpeTrainer", trainer)?;
689
691
  class.define_singleton_method("_new", function!(RbBpeTrainer::new, 1))?;
@@ -1,5 +1,6 @@
1
1
  use super::regex::{regex, RbRegex};
2
2
  use crate::RbResult;
3
+ use magnus::prelude::*;
3
4
  use magnus::{exception, Error, TryConvert, Value};
4
5
  use tk::normalizer::SplitDelimiterBehavior;
5
6
  use tk::pattern::Pattern;
@@ -13,9 +14,9 @@ pub enum RbPattern<'p> {
13
14
  impl TryConvert for RbPattern<'_> {
14
15
  fn try_convert(obj: Value) -> RbResult<Self> {
15
16
  if obj.is_kind_of(regex()) {
16
- Ok(RbPattern::Regex(obj.try_convert()?))
17
+ Ok(RbPattern::Regex(TryConvert::try_convert(obj)?))
17
18
  } else {
18
- Ok(RbPattern::Str(obj.try_convert()?))
19
+ Ok(RbPattern::Str(TryConvert::try_convert(obj)?))
19
20
  }
20
21
  }
21
22
  }
@@ -61,7 +62,7 @@ pub struct RbSplitDelimiterBehavior(pub SplitDelimiterBehavior);
61
62
 
62
63
  impl TryConvert for RbSplitDelimiterBehavior {
63
64
  fn try_convert(obj: Value) -> RbResult<Self> {
64
- let s = obj.try_convert::<String>()?;
65
+ let s = String::try_convert(obj)?;
65
66
 
66
67
  Ok(Self(match s.as_str() {
67
68
  "removed" => Ok(SplitDelimiterBehavior::Removed),
@@ -1,6 +1,6 @@
1
1
  use onig::Regex;
2
- use magnus::{exception, memoize, Error, Module, RClass};
3
- use crate::{module, RbResult};
2
+ use magnus::{exception, prelude::*, value::Lazy, Error, RClass, Ruby};
3
+ use crate::{RbResult, TOKENIZERS};
4
4
 
5
5
  #[magnus::wrap(class = "Tokenizers::Regex")]
6
6
  pub struct RbRegex {
@@ -17,6 +17,8 @@ impl RbRegex {
17
17
  }
18
18
  }
19
19
 
20
+ static REGEX: Lazy<RClass> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Regex").unwrap());
21
+
20
22
  pub fn regex() -> RClass {
21
- *memoize!(RClass: module().const_get("Regex").unwrap())
23
+ Ruby::get().unwrap().get_inner(&REGEX)
22
24
  }
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.13.3"
4
+ TOKENIZERS_VERSION = "0.14.0"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -1,8 +1,8 @@
1
1
  module Tokenizers
2
2
  module Models
3
3
  class Unigram
4
- def self.new(vocab: nil, unk_id: nil)
5
- _new(vocab, unk_id)
4
+ def self.new(vocab: nil, unk_id: nil, byte_fallback: nil)
5
+ _new(vocab, unk_id, byte_fallback)
6
6
  end
7
7
  end
8
8
  end
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.3.3"
2
+ VERSION = "0.4.0"
3
3
  end
data/lib/tokenizers.rb CHANGED
@@ -1,8 +1,8 @@
1
1
  # ext
2
2
  begin
3
- require_relative "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
3
+ require "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
4
4
  rescue LoadError
5
- require_relative "tokenizers/tokenizers"
5
+ require "tokenizers/tokenizers"
6
6
  end
7
7
 
8
8
  # decoders
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-04-09 00:00:00.000000000 Z
11
+ date: 2023-09-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -93,7 +93,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
93
93
  requirements:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
- version: '2.7'
96
+ version: '3'
97
97
  required_rubygems_version: !ruby/object:Gem::Requirement
98
98
  requirements:
99
99
  - - ">="