tokenizers 0.5.3 → 0.5.4

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -5,8 +5,9 @@ use crate::models::RbModel;
5
5
  use crate::tokenizer::RbAddedToken;
6
6
  use magnus::prelude::*;
7
7
  use magnus::{
8
- data_type_builder, exception, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Error, Module, Object,
9
- RArray, RClass, RHash, RModule, Ruby, Symbol, TryConvert, TypedData, Value,
8
+ data_type_builder, exception, function, method, value::Lazy, Class, DataType,
9
+ DataTypeFunctions, Error, Module, Object, RArray, RClass, RHash, RModule, Ruby, Symbol,
10
+ TryConvert, TypedData, Value,
10
11
  };
11
12
  use serde::{Deserialize, Serialize};
12
13
  use tk::models::TrainerWrapper;
@@ -68,7 +69,6 @@ macro_rules! setter {
68
69
  }
69
70
 
70
71
  impl RbTrainer {
71
-
72
72
  fn bpe_trainer_vocab_size(&self) -> usize {
73
73
  getter!(self, BpeTrainer, vocab_size)
74
74
  }
@@ -525,7 +525,9 @@ impl RbUnigramTrainer {
525
525
  return Err(Error::new(exception::arg_error(), "unknown keyword"));
526
526
  }
527
527
 
528
- let trainer = builder.build().map_err(|_| { Error::new(exception::arg_error(), "Cannot build UnigramTrainer") })?;
528
+ let trainer = builder
529
+ .build()
530
+ .map_err(|_| Error::new(exception::arg_error(), "Cannot build UnigramTrainer"))?;
529
531
  Ok(trainer.into())
530
532
  }
531
533
  }
@@ -567,7 +569,10 @@ impl RbWordLevelTrainer {
567
569
  builder.show_progress(TryConvert::try_convert(value)?);
568
570
  }
569
571
 
570
- Ok(builder.build().expect("WordLevelTrainerBuilder cannot fail").into())
572
+ Ok(builder
573
+ .build()
574
+ .expect("WordLevelTrainerBuilder cannot fail")
575
+ .into())
571
576
  }
572
577
  }
573
578
 
@@ -650,7 +655,8 @@ unsafe impl TypedData for RbTrainer {
650
655
  }
651
656
 
652
657
  fn data_type() -> &'static DataType {
653
- static DATA_TYPE: DataType = data_type_builder!(RbTrainer, "Tokenizers::Trainers::Trainer").build();
658
+ static DATA_TYPE: DataType =
659
+ data_type_builder!(RbTrainer, "Tokenizers::Trainers::Trainer").build();
654
660
  &DATA_TYPE
655
661
  }
656
662
 
@@ -661,17 +667,26 @@ unsafe impl TypedData for RbTrainer {
661
667
  class
662
668
  });
663
669
  static UNIGRAM_TRAINER: Lazy<RClass> = Lazy::new(|ruby| {
664
- let class: RClass = ruby.get_inner(&TRAINERS).const_get("UnigramTrainer").unwrap();
670
+ let class: RClass = ruby
671
+ .get_inner(&TRAINERS)
672
+ .const_get("UnigramTrainer")
673
+ .unwrap();
665
674
  class.undef_default_alloc_func();
666
675
  class
667
676
  });
668
677
  static WORD_LEVEL_TRAINER: Lazy<RClass> = Lazy::new(|ruby| {
669
- let class: RClass = ruby.get_inner(&TRAINERS).const_get("WordLevelTrainer").unwrap();
678
+ let class: RClass = ruby
679
+ .get_inner(&TRAINERS)
680
+ .const_get("WordLevelTrainer")
681
+ .unwrap();
670
682
  class.undef_default_alloc_func();
671
683
  class
672
684
  });
673
685
  static WORD_PIECE_TRAINER: Lazy<RClass> = Lazy::new(|ruby| {
674
- let class: RClass = ruby.get_inner(&TRAINERS).const_get("WordPieceTrainer").unwrap();
686
+ let class: RClass = ruby
687
+ .get_inner(&TRAINERS)
688
+ .const_get("WordPieceTrainer")
689
+ .unwrap();
675
690
  class.undef_default_alloc_func();
676
691
  class
677
692
  });
@@ -690,62 +705,206 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
690
705
  let class = module.define_class("BpeTrainer", trainer)?;
691
706
  class.define_singleton_method("_new", function!(RbBpeTrainer::new, 1))?;
692
707
  class.define_method("vocab_size", method!(RbTrainer::bpe_trainer_vocab_size, 0))?;
693
- class.define_method("vocab_size=", method!(RbTrainer::bpe_trainer_set_vocab_size, 1))?;
694
- class.define_method("min_frequency", method!(RbTrainer::bpe_trainer_min_frequency, 0))?;
695
- class.define_method("min_frequency=", method!(RbTrainer::bpe_trainer_set_min_frequency, 1))?;
696
- class.define_method("show_progress", method!(RbTrainer::bpe_trainer_show_progress, 0))?;
697
- class.define_method("show_progress=", method!(RbTrainer::bpe_trainer_set_show_progress, 1))?;
698
- class.define_method("special_tokens", method!(RbTrainer::bpe_trainer_special_tokens, 0))?;
699
- class.define_method("special_tokens=", method!(RbTrainer::bpe_trainer_set_special_tokens, 1))?;
700
- class.define_method("limit_alphabet", method!(RbTrainer::bpe_trainer_limit_alphabet, 0))?;
701
- class.define_method("limit_alphabet=", method!(RbTrainer::bpe_trainer_set_limit_alphabet, 1))?;
702
- class.define_method("initial_alphabet", method!(RbTrainer::bpe_trainer_initial_alphabet, 0))?;
703
- class.define_method("initial_alphabet=", method!(RbTrainer::bpe_trainer_set_initial_alphabet, 1))?;
704
- class.define_method("continuing_subword_prefix", method!(RbTrainer::bpe_trainer_continuing_subword_prefix, 0))?;
705
- class.define_method("continuing_subword_prefix=", method!(RbTrainer::bpe_trainer_set_continuing_subword_prefix, 1))?;
706
- class.define_method("end_of_word_suffix", method!(RbTrainer::bpe_trainer_end_of_word_suffix, 0))?;
707
- class.define_method("end_of_word_suffix=", method!(RbTrainer::bpe_trainer_set_end_of_word_suffix, 1))?;
708
+ class.define_method(
709
+ "vocab_size=",
710
+ method!(RbTrainer::bpe_trainer_set_vocab_size, 1),
711
+ )?;
712
+ class.define_method(
713
+ "min_frequency",
714
+ method!(RbTrainer::bpe_trainer_min_frequency, 0),
715
+ )?;
716
+ class.define_method(
717
+ "min_frequency=",
718
+ method!(RbTrainer::bpe_trainer_set_min_frequency, 1),
719
+ )?;
720
+ class.define_method(
721
+ "show_progress",
722
+ method!(RbTrainer::bpe_trainer_show_progress, 0),
723
+ )?;
724
+ class.define_method(
725
+ "show_progress=",
726
+ method!(RbTrainer::bpe_trainer_set_show_progress, 1),
727
+ )?;
728
+ class.define_method(
729
+ "special_tokens",
730
+ method!(RbTrainer::bpe_trainer_special_tokens, 0),
731
+ )?;
732
+ class.define_method(
733
+ "special_tokens=",
734
+ method!(RbTrainer::bpe_trainer_set_special_tokens, 1),
735
+ )?;
736
+ class.define_method(
737
+ "limit_alphabet",
738
+ method!(RbTrainer::bpe_trainer_limit_alphabet, 0),
739
+ )?;
740
+ class.define_method(
741
+ "limit_alphabet=",
742
+ method!(RbTrainer::bpe_trainer_set_limit_alphabet, 1),
743
+ )?;
744
+ class.define_method(
745
+ "initial_alphabet",
746
+ method!(RbTrainer::bpe_trainer_initial_alphabet, 0),
747
+ )?;
748
+ class.define_method(
749
+ "initial_alphabet=",
750
+ method!(RbTrainer::bpe_trainer_set_initial_alphabet, 1),
751
+ )?;
752
+ class.define_method(
753
+ "continuing_subword_prefix",
754
+ method!(RbTrainer::bpe_trainer_continuing_subword_prefix, 0),
755
+ )?;
756
+ class.define_method(
757
+ "continuing_subword_prefix=",
758
+ method!(RbTrainer::bpe_trainer_set_continuing_subword_prefix, 1),
759
+ )?;
760
+ class.define_method(
761
+ "end_of_word_suffix",
762
+ method!(RbTrainer::bpe_trainer_end_of_word_suffix, 0),
763
+ )?;
764
+ class.define_method(
765
+ "end_of_word_suffix=",
766
+ method!(RbTrainer::bpe_trainer_set_end_of_word_suffix, 1),
767
+ )?;
708
768
 
709
769
  let class = module.define_class("UnigramTrainer", trainer)?;
710
770
  class.define_singleton_method("_new", function!(RbUnigramTrainer::new, 1))?;
711
- class.define_method("vocab_size", method!(RbTrainer::unigram_trainer_vocab_size, 0))?;
712
- class.define_method("vocab_size=", method!(RbTrainer::unigram_trainer_set_vocab_size, 1))?;
713
- class.define_method("show_progress", method!(RbTrainer::unigram_trainer_show_progress, 0))?;
714
- class.define_method("show_progress=", method!(RbTrainer::unigram_trainer_set_show_progress, 1))?;
715
- class.define_method("special_tokens", method!(RbTrainer::unigram_trainer_special_tokens, 0))?;
716
- class.define_method("special_tokens=", method!(RbTrainer::unigram_trainer_set_special_tokens, 1))?;
717
- class.define_method("initial_alphabet", method!(RbTrainer::unigram_trainer_initial_alphabet, 0))?;
718
- class.define_method("initial_alphabet=", method!(RbTrainer::unigram_trainer_set_initial_alphabet, 1))?;
771
+ class.define_method(
772
+ "vocab_size",
773
+ method!(RbTrainer::unigram_trainer_vocab_size, 0),
774
+ )?;
775
+ class.define_method(
776
+ "vocab_size=",
777
+ method!(RbTrainer::unigram_trainer_set_vocab_size, 1),
778
+ )?;
779
+ class.define_method(
780
+ "show_progress",
781
+ method!(RbTrainer::unigram_trainer_show_progress, 0),
782
+ )?;
783
+ class.define_method(
784
+ "show_progress=",
785
+ method!(RbTrainer::unigram_trainer_set_show_progress, 1),
786
+ )?;
787
+ class.define_method(
788
+ "special_tokens",
789
+ method!(RbTrainer::unigram_trainer_special_tokens, 0),
790
+ )?;
791
+ class.define_method(
792
+ "special_tokens=",
793
+ method!(RbTrainer::unigram_trainer_set_special_tokens, 1),
794
+ )?;
795
+ class.define_method(
796
+ "initial_alphabet",
797
+ method!(RbTrainer::unigram_trainer_initial_alphabet, 0),
798
+ )?;
799
+ class.define_method(
800
+ "initial_alphabet=",
801
+ method!(RbTrainer::unigram_trainer_set_initial_alphabet, 1),
802
+ )?;
719
803
 
720
804
  let class = module.define_class("WordLevelTrainer", trainer)?;
721
805
  class.define_singleton_method("_new", function!(RbWordLevelTrainer::new, 1))?;
722
- class.define_method("vocab_size", method!(RbTrainer::word_level_trainer_vocab_size, 0))?;
723
- class.define_method("vocab_size=", method!(RbTrainer::word_level_trainer_set_vocab_size, 1))?;
724
- class.define_method("min_frequency", method!(RbTrainer::word_level_trainer_min_frequency, 0))?;
725
- class.define_method("min_frequency=", method!(RbTrainer::word_level_trainer_set_min_frequency, 1))?;
726
- class.define_method("show_progress", method!(RbTrainer::word_level_trainer_show_progress, 0))?;
727
- class.define_method("show_progress=", method!(RbTrainer::word_level_trainer_set_show_progress, 1))?;
728
- class.define_method("special_tokens", method!(RbTrainer::word_level_trainer_special_tokens, 0))?;
729
- class.define_method("special_tokens=", method!(RbTrainer::word_level_trainer_set_special_tokens, 1))?;
806
+ class.define_method(
807
+ "vocab_size",
808
+ method!(RbTrainer::word_level_trainer_vocab_size, 0),
809
+ )?;
810
+ class.define_method(
811
+ "vocab_size=",
812
+ method!(RbTrainer::word_level_trainer_set_vocab_size, 1),
813
+ )?;
814
+ class.define_method(
815
+ "min_frequency",
816
+ method!(RbTrainer::word_level_trainer_min_frequency, 0),
817
+ )?;
818
+ class.define_method(
819
+ "min_frequency=",
820
+ method!(RbTrainer::word_level_trainer_set_min_frequency, 1),
821
+ )?;
822
+ class.define_method(
823
+ "show_progress",
824
+ method!(RbTrainer::word_level_trainer_show_progress, 0),
825
+ )?;
826
+ class.define_method(
827
+ "show_progress=",
828
+ method!(RbTrainer::word_level_trainer_set_show_progress, 1),
829
+ )?;
830
+ class.define_method(
831
+ "special_tokens",
832
+ method!(RbTrainer::word_level_trainer_special_tokens, 0),
833
+ )?;
834
+ class.define_method(
835
+ "special_tokens=",
836
+ method!(RbTrainer::word_level_trainer_set_special_tokens, 1),
837
+ )?;
730
838
 
731
839
  let class = module.define_class("WordPieceTrainer", trainer)?;
732
840
  class.define_singleton_method("_new", function!(RbWordPieceTrainer::new, 1))?;
733
- class.define_method("vocab_size", method!(RbTrainer::word_piece_trainer_vocab_size, 0))?;
734
- class.define_method("vocab_size=", method!(RbTrainer::word_piece_trainer_set_vocab_size, 1))?;
735
- class.define_method("min_frequency", method!(RbTrainer::word_piece_trainer_min_frequency, 0))?;
736
- class.define_method("min_frequency=", method!(RbTrainer::word_piece_trainer_set_min_frequency, 1))?;
737
- class.define_method("show_progress", method!(RbTrainer::word_piece_trainer_show_progress, 0))?;
738
- class.define_method("show_progress=", method!(RbTrainer::word_piece_trainer_set_show_progress, 1))?;
739
- class.define_method("special_tokens", method!(RbTrainer::word_piece_trainer_special_tokens, 0))?;
740
- class.define_method("special_tokens=", method!(RbTrainer::word_piece_trainer_set_special_tokens, 1))?;
741
- class.define_method("limit_alphabet", method!(RbTrainer::word_piece_trainer_limit_alphabet, 0))?;
742
- class.define_method("limit_alphabet=", method!(RbTrainer::word_piece_trainer_set_limit_alphabet, 1))?;
743
- class.define_method("initial_alphabet", method!(RbTrainer::word_piece_trainer_initial_alphabet, 0))?;
744
- class.define_method("initial_alphabet=", method!(RbTrainer::word_piece_trainer_set_initial_alphabet, 1))?;
745
- class.define_method("continuing_subword_prefix", method!(RbTrainer::word_piece_trainer_continuing_subword_prefix, 0))?;
746
- class.define_method("continuing_subword_prefix=", method!(RbTrainer::word_piece_trainer_set_continuing_subword_prefix, 1))?;
747
- class.define_method("end_of_word_suffix", method!(RbTrainer::word_piece_trainer_end_of_word_suffix, 0))?;
748
- class.define_method("end_of_word_suffix=", method!(RbTrainer::word_piece_trainer_set_end_of_word_suffix, 1))?;
841
+ class.define_method(
842
+ "vocab_size",
843
+ method!(RbTrainer::word_piece_trainer_vocab_size, 0),
844
+ )?;
845
+ class.define_method(
846
+ "vocab_size=",
847
+ method!(RbTrainer::word_piece_trainer_set_vocab_size, 1),
848
+ )?;
849
+ class.define_method(
850
+ "min_frequency",
851
+ method!(RbTrainer::word_piece_trainer_min_frequency, 0),
852
+ )?;
853
+ class.define_method(
854
+ "min_frequency=",
855
+ method!(RbTrainer::word_piece_trainer_set_min_frequency, 1),
856
+ )?;
857
+ class.define_method(
858
+ "show_progress",
859
+ method!(RbTrainer::word_piece_trainer_show_progress, 0),
860
+ )?;
861
+ class.define_method(
862
+ "show_progress=",
863
+ method!(RbTrainer::word_piece_trainer_set_show_progress, 1),
864
+ )?;
865
+ class.define_method(
866
+ "special_tokens",
867
+ method!(RbTrainer::word_piece_trainer_special_tokens, 0),
868
+ )?;
869
+ class.define_method(
870
+ "special_tokens=",
871
+ method!(RbTrainer::word_piece_trainer_set_special_tokens, 1),
872
+ )?;
873
+ class.define_method(
874
+ "limit_alphabet",
875
+ method!(RbTrainer::word_piece_trainer_limit_alphabet, 0),
876
+ )?;
877
+ class.define_method(
878
+ "limit_alphabet=",
879
+ method!(RbTrainer::word_piece_trainer_set_limit_alphabet, 1),
880
+ )?;
881
+ class.define_method(
882
+ "initial_alphabet",
883
+ method!(RbTrainer::word_piece_trainer_initial_alphabet, 0),
884
+ )?;
885
+ class.define_method(
886
+ "initial_alphabet=",
887
+ method!(RbTrainer::word_piece_trainer_set_initial_alphabet, 1),
888
+ )?;
889
+ class.define_method(
890
+ "continuing_subword_prefix",
891
+ method!(RbTrainer::word_piece_trainer_continuing_subword_prefix, 0),
892
+ )?;
893
+ class.define_method(
894
+ "continuing_subword_prefix=",
895
+ method!(
896
+ RbTrainer::word_piece_trainer_set_continuing_subword_prefix,
897
+ 1
898
+ ),
899
+ )?;
900
+ class.define_method(
901
+ "end_of_word_suffix",
902
+ method!(RbTrainer::word_piece_trainer_end_of_word_suffix, 0),
903
+ )?;
904
+ class.define_method(
905
+ "end_of_word_suffix=",
906
+ method!(RbTrainer::word_piece_trainer_set_end_of_word_suffix, 1),
907
+ )?;
749
908
 
750
909
  Ok(())
751
910
  }
@@ -1,6 +1,6 @@
1
- use onig::Regex;
2
- use magnus::{exception, prelude::*, value::Lazy, Error, RClass, Ruby};
3
1
  use crate::{RbResult, TOKENIZERS};
2
+ use magnus::{exception, prelude::*, value::Lazy, Error, RClass, Ruby};
3
+ use onig::Regex;
4
4
 
5
5
  #[magnus::wrap(class = "Tokenizers::Regex")]
6
6
  pub struct RbRegex {
@@ -11,13 +11,15 @@ pub struct RbRegex {
11
11
  impl RbRegex {
12
12
  pub fn new(s: String) -> RbResult<Self> {
13
13
  Ok(Self {
14
- inner: Regex::new(&s).map_err(|e| Error::new(exception::runtime_error(), e.description().to_owned()))?,
14
+ inner: Regex::new(&s)
15
+ .map_err(|e| Error::new(exception::runtime_error(), e.description().to_owned()))?,
15
16
  pattern: s,
16
17
  })
17
18
  }
18
19
  }
19
20
 
20
- static REGEX: Lazy<RClass> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Regex").unwrap());
21
+ static REGEX: Lazy<RClass> =
22
+ Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Regex").unwrap());
21
23
 
22
24
  pub fn regex() -> RClass {
23
25
  Ruby::get().unwrap().get_inner(&REGEX)
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.20.0"
4
+ TOKENIZERS_VERSION = "0.21.0"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.5.3"
2
+ VERSION = "0.5.4"
3
3
  end
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.3
4
+ version: 0.5.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2024-09-17 00:00:00.000000000 Z
10
+ date: 2024-12-29 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: rb_sys
@@ -24,7 +23,6 @@ dependencies:
24
23
  - - ">="
25
24
  - !ruby/object:Gem::Version
26
25
  version: '0'
27
- description:
28
26
  email: andrew@ankane.org
29
27
  executables: []
30
28
  extensions:
@@ -86,7 +84,6 @@ homepage: https://github.com/ankane/tokenizers-ruby
86
84
  licenses:
87
85
  - Apache-2.0
88
86
  metadata: {}
89
- post_install_message:
90
87
  rdoc_options: []
91
88
  require_paths:
92
89
  - lib
@@ -101,8 +98,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
101
98
  - !ruby/object:Gem::Version
102
99
  version: '0'
103
100
  requirements: []
104
- rubygems_version: 3.5.16
105
- signing_key:
101
+ rubygems_version: 3.6.2
106
102
  specification_version: 4
107
103
  summary: Fast state-of-the-art tokenizers for Ruby
108
104
  test_files: []