tokenizers 0.5.3 → 0.5.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -5,8 +5,9 @@ use crate::models::RbModel;
5
5
  use crate::tokenizer::RbAddedToken;
6
6
  use magnus::prelude::*;
7
7
  use magnus::{
8
- data_type_builder, exception, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Error, Module, Object,
9
- RArray, RClass, RHash, RModule, Ruby, Symbol, TryConvert, TypedData, Value,
8
+ data_type_builder, exception, function, method, value::Lazy, Class, DataType,
9
+ DataTypeFunctions, Error, Module, Object, RArray, RClass, RHash, RModule, Ruby, Symbol,
10
+ TryConvert, TypedData, Value,
10
11
  };
11
12
  use serde::{Deserialize, Serialize};
12
13
  use tk::models::TrainerWrapper;
@@ -68,7 +69,6 @@ macro_rules! setter {
68
69
  }
69
70
 
70
71
  impl RbTrainer {
71
-
72
72
  fn bpe_trainer_vocab_size(&self) -> usize {
73
73
  getter!(self, BpeTrainer, vocab_size)
74
74
  }
@@ -525,7 +525,9 @@ impl RbUnigramTrainer {
525
525
  return Err(Error::new(exception::arg_error(), "unknown keyword"));
526
526
  }
527
527
 
528
- let trainer = builder.build().map_err(|_| { Error::new(exception::arg_error(), "Cannot build UnigramTrainer") })?;
528
+ let trainer = builder
529
+ .build()
530
+ .map_err(|_| Error::new(exception::arg_error(), "Cannot build UnigramTrainer"))?;
529
531
  Ok(trainer.into())
530
532
  }
531
533
  }
@@ -567,7 +569,10 @@ impl RbWordLevelTrainer {
567
569
  builder.show_progress(TryConvert::try_convert(value)?);
568
570
  }
569
571
 
570
- Ok(builder.build().expect("WordLevelTrainerBuilder cannot fail").into())
572
+ Ok(builder
573
+ .build()
574
+ .expect("WordLevelTrainerBuilder cannot fail")
575
+ .into())
571
576
  }
572
577
  }
573
578
 
@@ -650,7 +655,8 @@ unsafe impl TypedData for RbTrainer {
650
655
  }
651
656
 
652
657
  fn data_type() -> &'static DataType {
653
- static DATA_TYPE: DataType = data_type_builder!(RbTrainer, "Tokenizers::Trainers::Trainer").build();
658
+ static DATA_TYPE: DataType =
659
+ data_type_builder!(RbTrainer, "Tokenizers::Trainers::Trainer").build();
654
660
  &DATA_TYPE
655
661
  }
656
662
 
@@ -661,17 +667,26 @@ unsafe impl TypedData for RbTrainer {
661
667
  class
662
668
  });
663
669
  static UNIGRAM_TRAINER: Lazy<RClass> = Lazy::new(|ruby| {
664
- let class: RClass = ruby.get_inner(&TRAINERS).const_get("UnigramTrainer").unwrap();
670
+ let class: RClass = ruby
671
+ .get_inner(&TRAINERS)
672
+ .const_get("UnigramTrainer")
673
+ .unwrap();
665
674
  class.undef_default_alloc_func();
666
675
  class
667
676
  });
668
677
  static WORD_LEVEL_TRAINER: Lazy<RClass> = Lazy::new(|ruby| {
669
- let class: RClass = ruby.get_inner(&TRAINERS).const_get("WordLevelTrainer").unwrap();
678
+ let class: RClass = ruby
679
+ .get_inner(&TRAINERS)
680
+ .const_get("WordLevelTrainer")
681
+ .unwrap();
670
682
  class.undef_default_alloc_func();
671
683
  class
672
684
  });
673
685
  static WORD_PIECE_TRAINER: Lazy<RClass> = Lazy::new(|ruby| {
674
- let class: RClass = ruby.get_inner(&TRAINERS).const_get("WordPieceTrainer").unwrap();
686
+ let class: RClass = ruby
687
+ .get_inner(&TRAINERS)
688
+ .const_get("WordPieceTrainer")
689
+ .unwrap();
675
690
  class.undef_default_alloc_func();
676
691
  class
677
692
  });
@@ -690,62 +705,206 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
690
705
  let class = module.define_class("BpeTrainer", trainer)?;
691
706
  class.define_singleton_method("_new", function!(RbBpeTrainer::new, 1))?;
692
707
  class.define_method("vocab_size", method!(RbTrainer::bpe_trainer_vocab_size, 0))?;
693
- class.define_method("vocab_size=", method!(RbTrainer::bpe_trainer_set_vocab_size, 1))?;
694
- class.define_method("min_frequency", method!(RbTrainer::bpe_trainer_min_frequency, 0))?;
695
- class.define_method("min_frequency=", method!(RbTrainer::bpe_trainer_set_min_frequency, 1))?;
696
- class.define_method("show_progress", method!(RbTrainer::bpe_trainer_show_progress, 0))?;
697
- class.define_method("show_progress=", method!(RbTrainer::bpe_trainer_set_show_progress, 1))?;
698
- class.define_method("special_tokens", method!(RbTrainer::bpe_trainer_special_tokens, 0))?;
699
- class.define_method("special_tokens=", method!(RbTrainer::bpe_trainer_set_special_tokens, 1))?;
700
- class.define_method("limit_alphabet", method!(RbTrainer::bpe_trainer_limit_alphabet, 0))?;
701
- class.define_method("limit_alphabet=", method!(RbTrainer::bpe_trainer_set_limit_alphabet, 1))?;
702
- class.define_method("initial_alphabet", method!(RbTrainer::bpe_trainer_initial_alphabet, 0))?;
703
- class.define_method("initial_alphabet=", method!(RbTrainer::bpe_trainer_set_initial_alphabet, 1))?;
704
- class.define_method("continuing_subword_prefix", method!(RbTrainer::bpe_trainer_continuing_subword_prefix, 0))?;
705
- class.define_method("continuing_subword_prefix=", method!(RbTrainer::bpe_trainer_set_continuing_subword_prefix, 1))?;
706
- class.define_method("end_of_word_suffix", method!(RbTrainer::bpe_trainer_end_of_word_suffix, 0))?;
707
- class.define_method("end_of_word_suffix=", method!(RbTrainer::bpe_trainer_set_end_of_word_suffix, 1))?;
708
+ class.define_method(
709
+ "vocab_size=",
710
+ method!(RbTrainer::bpe_trainer_set_vocab_size, 1),
711
+ )?;
712
+ class.define_method(
713
+ "min_frequency",
714
+ method!(RbTrainer::bpe_trainer_min_frequency, 0),
715
+ )?;
716
+ class.define_method(
717
+ "min_frequency=",
718
+ method!(RbTrainer::bpe_trainer_set_min_frequency, 1),
719
+ )?;
720
+ class.define_method(
721
+ "show_progress",
722
+ method!(RbTrainer::bpe_trainer_show_progress, 0),
723
+ )?;
724
+ class.define_method(
725
+ "show_progress=",
726
+ method!(RbTrainer::bpe_trainer_set_show_progress, 1),
727
+ )?;
728
+ class.define_method(
729
+ "special_tokens",
730
+ method!(RbTrainer::bpe_trainer_special_tokens, 0),
731
+ )?;
732
+ class.define_method(
733
+ "special_tokens=",
734
+ method!(RbTrainer::bpe_trainer_set_special_tokens, 1),
735
+ )?;
736
+ class.define_method(
737
+ "limit_alphabet",
738
+ method!(RbTrainer::bpe_trainer_limit_alphabet, 0),
739
+ )?;
740
+ class.define_method(
741
+ "limit_alphabet=",
742
+ method!(RbTrainer::bpe_trainer_set_limit_alphabet, 1),
743
+ )?;
744
+ class.define_method(
745
+ "initial_alphabet",
746
+ method!(RbTrainer::bpe_trainer_initial_alphabet, 0),
747
+ )?;
748
+ class.define_method(
749
+ "initial_alphabet=",
750
+ method!(RbTrainer::bpe_trainer_set_initial_alphabet, 1),
751
+ )?;
752
+ class.define_method(
753
+ "continuing_subword_prefix",
754
+ method!(RbTrainer::bpe_trainer_continuing_subword_prefix, 0),
755
+ )?;
756
+ class.define_method(
757
+ "continuing_subword_prefix=",
758
+ method!(RbTrainer::bpe_trainer_set_continuing_subword_prefix, 1),
759
+ )?;
760
+ class.define_method(
761
+ "end_of_word_suffix",
762
+ method!(RbTrainer::bpe_trainer_end_of_word_suffix, 0),
763
+ )?;
764
+ class.define_method(
765
+ "end_of_word_suffix=",
766
+ method!(RbTrainer::bpe_trainer_set_end_of_word_suffix, 1),
767
+ )?;
708
768
 
709
769
  let class = module.define_class("UnigramTrainer", trainer)?;
710
770
  class.define_singleton_method("_new", function!(RbUnigramTrainer::new, 1))?;
711
- class.define_method("vocab_size", method!(RbTrainer::unigram_trainer_vocab_size, 0))?;
712
- class.define_method("vocab_size=", method!(RbTrainer::unigram_trainer_set_vocab_size, 1))?;
713
- class.define_method("show_progress", method!(RbTrainer::unigram_trainer_show_progress, 0))?;
714
- class.define_method("show_progress=", method!(RbTrainer::unigram_trainer_set_show_progress, 1))?;
715
- class.define_method("special_tokens", method!(RbTrainer::unigram_trainer_special_tokens, 0))?;
716
- class.define_method("special_tokens=", method!(RbTrainer::unigram_trainer_set_special_tokens, 1))?;
717
- class.define_method("initial_alphabet", method!(RbTrainer::unigram_trainer_initial_alphabet, 0))?;
718
- class.define_method("initial_alphabet=", method!(RbTrainer::unigram_trainer_set_initial_alphabet, 1))?;
771
+ class.define_method(
772
+ "vocab_size",
773
+ method!(RbTrainer::unigram_trainer_vocab_size, 0),
774
+ )?;
775
+ class.define_method(
776
+ "vocab_size=",
777
+ method!(RbTrainer::unigram_trainer_set_vocab_size, 1),
778
+ )?;
779
+ class.define_method(
780
+ "show_progress",
781
+ method!(RbTrainer::unigram_trainer_show_progress, 0),
782
+ )?;
783
+ class.define_method(
784
+ "show_progress=",
785
+ method!(RbTrainer::unigram_trainer_set_show_progress, 1),
786
+ )?;
787
+ class.define_method(
788
+ "special_tokens",
789
+ method!(RbTrainer::unigram_trainer_special_tokens, 0),
790
+ )?;
791
+ class.define_method(
792
+ "special_tokens=",
793
+ method!(RbTrainer::unigram_trainer_set_special_tokens, 1),
794
+ )?;
795
+ class.define_method(
796
+ "initial_alphabet",
797
+ method!(RbTrainer::unigram_trainer_initial_alphabet, 0),
798
+ )?;
799
+ class.define_method(
800
+ "initial_alphabet=",
801
+ method!(RbTrainer::unigram_trainer_set_initial_alphabet, 1),
802
+ )?;
719
803
 
720
804
  let class = module.define_class("WordLevelTrainer", trainer)?;
721
805
  class.define_singleton_method("_new", function!(RbWordLevelTrainer::new, 1))?;
722
- class.define_method("vocab_size", method!(RbTrainer::word_level_trainer_vocab_size, 0))?;
723
- class.define_method("vocab_size=", method!(RbTrainer::word_level_trainer_set_vocab_size, 1))?;
724
- class.define_method("min_frequency", method!(RbTrainer::word_level_trainer_min_frequency, 0))?;
725
- class.define_method("min_frequency=", method!(RbTrainer::word_level_trainer_set_min_frequency, 1))?;
726
- class.define_method("show_progress", method!(RbTrainer::word_level_trainer_show_progress, 0))?;
727
- class.define_method("show_progress=", method!(RbTrainer::word_level_trainer_set_show_progress, 1))?;
728
- class.define_method("special_tokens", method!(RbTrainer::word_level_trainer_special_tokens, 0))?;
729
- class.define_method("special_tokens=", method!(RbTrainer::word_level_trainer_set_special_tokens, 1))?;
806
+ class.define_method(
807
+ "vocab_size",
808
+ method!(RbTrainer::word_level_trainer_vocab_size, 0),
809
+ )?;
810
+ class.define_method(
811
+ "vocab_size=",
812
+ method!(RbTrainer::word_level_trainer_set_vocab_size, 1),
813
+ )?;
814
+ class.define_method(
815
+ "min_frequency",
816
+ method!(RbTrainer::word_level_trainer_min_frequency, 0),
817
+ )?;
818
+ class.define_method(
819
+ "min_frequency=",
820
+ method!(RbTrainer::word_level_trainer_set_min_frequency, 1),
821
+ )?;
822
+ class.define_method(
823
+ "show_progress",
824
+ method!(RbTrainer::word_level_trainer_show_progress, 0),
825
+ )?;
826
+ class.define_method(
827
+ "show_progress=",
828
+ method!(RbTrainer::word_level_trainer_set_show_progress, 1),
829
+ )?;
830
+ class.define_method(
831
+ "special_tokens",
832
+ method!(RbTrainer::word_level_trainer_special_tokens, 0),
833
+ )?;
834
+ class.define_method(
835
+ "special_tokens=",
836
+ method!(RbTrainer::word_level_trainer_set_special_tokens, 1),
837
+ )?;
730
838
 
731
839
  let class = module.define_class("WordPieceTrainer", trainer)?;
732
840
  class.define_singleton_method("_new", function!(RbWordPieceTrainer::new, 1))?;
733
- class.define_method("vocab_size", method!(RbTrainer::word_piece_trainer_vocab_size, 0))?;
734
- class.define_method("vocab_size=", method!(RbTrainer::word_piece_trainer_set_vocab_size, 1))?;
735
- class.define_method("min_frequency", method!(RbTrainer::word_piece_trainer_min_frequency, 0))?;
736
- class.define_method("min_frequency=", method!(RbTrainer::word_piece_trainer_set_min_frequency, 1))?;
737
- class.define_method("show_progress", method!(RbTrainer::word_piece_trainer_show_progress, 0))?;
738
- class.define_method("show_progress=", method!(RbTrainer::word_piece_trainer_set_show_progress, 1))?;
739
- class.define_method("special_tokens", method!(RbTrainer::word_piece_trainer_special_tokens, 0))?;
740
- class.define_method("special_tokens=", method!(RbTrainer::word_piece_trainer_set_special_tokens, 1))?;
741
- class.define_method("limit_alphabet", method!(RbTrainer::word_piece_trainer_limit_alphabet, 0))?;
742
- class.define_method("limit_alphabet=", method!(RbTrainer::word_piece_trainer_set_limit_alphabet, 1))?;
743
- class.define_method("initial_alphabet", method!(RbTrainer::word_piece_trainer_initial_alphabet, 0))?;
744
- class.define_method("initial_alphabet=", method!(RbTrainer::word_piece_trainer_set_initial_alphabet, 1))?;
745
- class.define_method("continuing_subword_prefix", method!(RbTrainer::word_piece_trainer_continuing_subword_prefix, 0))?;
746
- class.define_method("continuing_subword_prefix=", method!(RbTrainer::word_piece_trainer_set_continuing_subword_prefix, 1))?;
747
- class.define_method("end_of_word_suffix", method!(RbTrainer::word_piece_trainer_end_of_word_suffix, 0))?;
748
- class.define_method("end_of_word_suffix=", method!(RbTrainer::word_piece_trainer_set_end_of_word_suffix, 1))?;
841
+ class.define_method(
842
+ "vocab_size",
843
+ method!(RbTrainer::word_piece_trainer_vocab_size, 0),
844
+ )?;
845
+ class.define_method(
846
+ "vocab_size=",
847
+ method!(RbTrainer::word_piece_trainer_set_vocab_size, 1),
848
+ )?;
849
+ class.define_method(
850
+ "min_frequency",
851
+ method!(RbTrainer::word_piece_trainer_min_frequency, 0),
852
+ )?;
853
+ class.define_method(
854
+ "min_frequency=",
855
+ method!(RbTrainer::word_piece_trainer_set_min_frequency, 1),
856
+ )?;
857
+ class.define_method(
858
+ "show_progress",
859
+ method!(RbTrainer::word_piece_trainer_show_progress, 0),
860
+ )?;
861
+ class.define_method(
862
+ "show_progress=",
863
+ method!(RbTrainer::word_piece_trainer_set_show_progress, 1),
864
+ )?;
865
+ class.define_method(
866
+ "special_tokens",
867
+ method!(RbTrainer::word_piece_trainer_special_tokens, 0),
868
+ )?;
869
+ class.define_method(
870
+ "special_tokens=",
871
+ method!(RbTrainer::word_piece_trainer_set_special_tokens, 1),
872
+ )?;
873
+ class.define_method(
874
+ "limit_alphabet",
875
+ method!(RbTrainer::word_piece_trainer_limit_alphabet, 0),
876
+ )?;
877
+ class.define_method(
878
+ "limit_alphabet=",
879
+ method!(RbTrainer::word_piece_trainer_set_limit_alphabet, 1),
880
+ )?;
881
+ class.define_method(
882
+ "initial_alphabet",
883
+ method!(RbTrainer::word_piece_trainer_initial_alphabet, 0),
884
+ )?;
885
+ class.define_method(
886
+ "initial_alphabet=",
887
+ method!(RbTrainer::word_piece_trainer_set_initial_alphabet, 1),
888
+ )?;
889
+ class.define_method(
890
+ "continuing_subword_prefix",
891
+ method!(RbTrainer::word_piece_trainer_continuing_subword_prefix, 0),
892
+ )?;
893
+ class.define_method(
894
+ "continuing_subword_prefix=",
895
+ method!(
896
+ RbTrainer::word_piece_trainer_set_continuing_subword_prefix,
897
+ 1
898
+ ),
899
+ )?;
900
+ class.define_method(
901
+ "end_of_word_suffix",
902
+ method!(RbTrainer::word_piece_trainer_end_of_word_suffix, 0),
903
+ )?;
904
+ class.define_method(
905
+ "end_of_word_suffix=",
906
+ method!(RbTrainer::word_piece_trainer_set_end_of_word_suffix, 1),
907
+ )?;
749
908
 
750
909
  Ok(())
751
910
  }
@@ -1,6 +1,6 @@
1
- use onig::Regex;
2
- use magnus::{exception, prelude::*, value::Lazy, Error, RClass, Ruby};
3
1
  use crate::{RbResult, TOKENIZERS};
2
+ use magnus::{exception, prelude::*, value::Lazy, Error, RClass, Ruby};
3
+ use onig::Regex;
4
4
 
5
5
  #[magnus::wrap(class = "Tokenizers::Regex")]
6
6
  pub struct RbRegex {
@@ -11,13 +11,15 @@ pub struct RbRegex {
11
11
  impl RbRegex {
12
12
  pub fn new(s: String) -> RbResult<Self> {
13
13
  Ok(Self {
14
- inner: Regex::new(&s).map_err(|e| Error::new(exception::runtime_error(), e.description().to_owned()))?,
14
+ inner: Regex::new(&s)
15
+ .map_err(|e| Error::new(exception::runtime_error(), e.description().to_owned()))?,
15
16
  pattern: s,
16
17
  })
17
18
  }
18
19
  }
19
20
 
20
- static REGEX: Lazy<RClass> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Regex").unwrap());
21
+ static REGEX: Lazy<RClass> =
22
+ Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Regex").unwrap());
21
23
 
22
24
  pub fn regex() -> RClass {
23
25
  Ruby::get().unwrap().get_inner(&REGEX)
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.20.0"
4
+ TOKENIZERS_VERSION = "0.21.0"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.5.3"
2
+ VERSION = "0.5.4"
3
3
  end
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.3
4
+ version: 0.5.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2024-09-17 00:00:00.000000000 Z
10
+ date: 2024-12-29 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: rb_sys
@@ -24,7 +23,6 @@ dependencies:
24
23
  - - ">="
25
24
  - !ruby/object:Gem::Version
26
25
  version: '0'
27
- description:
28
26
  email: andrew@ankane.org
29
27
  executables: []
30
28
  extensions:
@@ -86,7 +84,6 @@ homepage: https://github.com/ankane/tokenizers-ruby
86
84
  licenses:
87
85
  - Apache-2.0
88
86
  metadata: {}
89
- post_install_message:
90
87
  rdoc_options: []
91
88
  require_paths:
92
89
  - lib
@@ -101,8 +98,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
101
98
  - !ruby/object:Gem::Version
102
99
  version: '0'
103
100
  requirements: []
104
- rubygems_version: 3.5.16
105
- signing_key:
101
+ rubygems_version: 3.6.2
106
102
  specification_version: 4
107
103
  summary: Fast state-of-the-art tokenizers for Ruby
108
104
  test_files: []