tokenizers 0.5.3 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Cargo.lock +154 -83
- data/ext/tokenizers/Cargo.toml +2 -2
- data/ext/tokenizers/src/decoders.rs +32 -14
- data/ext/tokenizers/src/error.rs +6 -1
- data/ext/tokenizers/src/lib.rs +37 -12
- data/ext/tokenizers/src/models.rs +75 -23
- data/ext/tokenizers/src/normalizers.rs +84 -24
- data/ext/tokenizers/src/pre_tokenizers.rs +121 -42
- data/ext/tokenizers/src/processors.rs +22 -10
- data/ext/tokenizers/src/tokenizer.rs +63 -34
- data/ext/tokenizers/src/trainers.rs +215 -56
- data/ext/tokenizers/src/utils/regex.rs +6 -4
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/version.rb +1 -1
- metadata +3 -7
@@ -5,8 +5,9 @@ use crate::models::RbModel;
|
|
5
5
|
use crate::tokenizer::RbAddedToken;
|
6
6
|
use magnus::prelude::*;
|
7
7
|
use magnus::{
|
8
|
-
data_type_builder, exception, function, method, value::Lazy, Class, DataType,
|
9
|
-
RArray, RClass, RHash, RModule, Ruby, Symbol,
|
8
|
+
data_type_builder, exception, function, method, value::Lazy, Class, DataType,
|
9
|
+
DataTypeFunctions, Error, Module, Object, RArray, RClass, RHash, RModule, Ruby, Symbol,
|
10
|
+
TryConvert, TypedData, Value,
|
10
11
|
};
|
11
12
|
use serde::{Deserialize, Serialize};
|
12
13
|
use tk::models::TrainerWrapper;
|
@@ -68,7 +69,6 @@ macro_rules! setter {
|
|
68
69
|
}
|
69
70
|
|
70
71
|
impl RbTrainer {
|
71
|
-
|
72
72
|
fn bpe_trainer_vocab_size(&self) -> usize {
|
73
73
|
getter!(self, BpeTrainer, vocab_size)
|
74
74
|
}
|
@@ -525,7 +525,9 @@ impl RbUnigramTrainer {
|
|
525
525
|
return Err(Error::new(exception::arg_error(), "unknown keyword"));
|
526
526
|
}
|
527
527
|
|
528
|
-
let trainer = builder
|
528
|
+
let trainer = builder
|
529
|
+
.build()
|
530
|
+
.map_err(|_| Error::new(exception::arg_error(), "Cannot build UnigramTrainer"))?;
|
529
531
|
Ok(trainer.into())
|
530
532
|
}
|
531
533
|
}
|
@@ -567,7 +569,10 @@ impl RbWordLevelTrainer {
|
|
567
569
|
builder.show_progress(TryConvert::try_convert(value)?);
|
568
570
|
}
|
569
571
|
|
570
|
-
Ok(builder
|
572
|
+
Ok(builder
|
573
|
+
.build()
|
574
|
+
.expect("WordLevelTrainerBuilder cannot fail")
|
575
|
+
.into())
|
571
576
|
}
|
572
577
|
}
|
573
578
|
|
@@ -650,7 +655,8 @@ unsafe impl TypedData for RbTrainer {
|
|
650
655
|
}
|
651
656
|
|
652
657
|
fn data_type() -> &'static DataType {
|
653
|
-
static DATA_TYPE: DataType =
|
658
|
+
static DATA_TYPE: DataType =
|
659
|
+
data_type_builder!(RbTrainer, "Tokenizers::Trainers::Trainer").build();
|
654
660
|
&DATA_TYPE
|
655
661
|
}
|
656
662
|
|
@@ -661,17 +667,26 @@ unsafe impl TypedData for RbTrainer {
|
|
661
667
|
class
|
662
668
|
});
|
663
669
|
static UNIGRAM_TRAINER: Lazy<RClass> = Lazy::new(|ruby| {
|
664
|
-
let class: RClass = ruby
|
670
|
+
let class: RClass = ruby
|
671
|
+
.get_inner(&TRAINERS)
|
672
|
+
.const_get("UnigramTrainer")
|
673
|
+
.unwrap();
|
665
674
|
class.undef_default_alloc_func();
|
666
675
|
class
|
667
676
|
});
|
668
677
|
static WORD_LEVEL_TRAINER: Lazy<RClass> = Lazy::new(|ruby| {
|
669
|
-
let class: RClass = ruby
|
678
|
+
let class: RClass = ruby
|
679
|
+
.get_inner(&TRAINERS)
|
680
|
+
.const_get("WordLevelTrainer")
|
681
|
+
.unwrap();
|
670
682
|
class.undef_default_alloc_func();
|
671
683
|
class
|
672
684
|
});
|
673
685
|
static WORD_PIECE_TRAINER: Lazy<RClass> = Lazy::new(|ruby| {
|
674
|
-
let class: RClass = ruby
|
686
|
+
let class: RClass = ruby
|
687
|
+
.get_inner(&TRAINERS)
|
688
|
+
.const_get("WordPieceTrainer")
|
689
|
+
.unwrap();
|
675
690
|
class.undef_default_alloc_func();
|
676
691
|
class
|
677
692
|
});
|
@@ -690,62 +705,206 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
690
705
|
let class = module.define_class("BpeTrainer", trainer)?;
|
691
706
|
class.define_singleton_method("_new", function!(RbBpeTrainer::new, 1))?;
|
692
707
|
class.define_method("vocab_size", method!(RbTrainer::bpe_trainer_vocab_size, 0))?;
|
693
|
-
class.define_method(
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
class.define_method(
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
class.define_method(
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
class.define_method(
|
706
|
-
|
707
|
-
|
708
|
+
class.define_method(
|
709
|
+
"vocab_size=",
|
710
|
+
method!(RbTrainer::bpe_trainer_set_vocab_size, 1),
|
711
|
+
)?;
|
712
|
+
class.define_method(
|
713
|
+
"min_frequency",
|
714
|
+
method!(RbTrainer::bpe_trainer_min_frequency, 0),
|
715
|
+
)?;
|
716
|
+
class.define_method(
|
717
|
+
"min_frequency=",
|
718
|
+
method!(RbTrainer::bpe_trainer_set_min_frequency, 1),
|
719
|
+
)?;
|
720
|
+
class.define_method(
|
721
|
+
"show_progress",
|
722
|
+
method!(RbTrainer::bpe_trainer_show_progress, 0),
|
723
|
+
)?;
|
724
|
+
class.define_method(
|
725
|
+
"show_progress=",
|
726
|
+
method!(RbTrainer::bpe_trainer_set_show_progress, 1),
|
727
|
+
)?;
|
728
|
+
class.define_method(
|
729
|
+
"special_tokens",
|
730
|
+
method!(RbTrainer::bpe_trainer_special_tokens, 0),
|
731
|
+
)?;
|
732
|
+
class.define_method(
|
733
|
+
"special_tokens=",
|
734
|
+
method!(RbTrainer::bpe_trainer_set_special_tokens, 1),
|
735
|
+
)?;
|
736
|
+
class.define_method(
|
737
|
+
"limit_alphabet",
|
738
|
+
method!(RbTrainer::bpe_trainer_limit_alphabet, 0),
|
739
|
+
)?;
|
740
|
+
class.define_method(
|
741
|
+
"limit_alphabet=",
|
742
|
+
method!(RbTrainer::bpe_trainer_set_limit_alphabet, 1),
|
743
|
+
)?;
|
744
|
+
class.define_method(
|
745
|
+
"initial_alphabet",
|
746
|
+
method!(RbTrainer::bpe_trainer_initial_alphabet, 0),
|
747
|
+
)?;
|
748
|
+
class.define_method(
|
749
|
+
"initial_alphabet=",
|
750
|
+
method!(RbTrainer::bpe_trainer_set_initial_alphabet, 1),
|
751
|
+
)?;
|
752
|
+
class.define_method(
|
753
|
+
"continuing_subword_prefix",
|
754
|
+
method!(RbTrainer::bpe_trainer_continuing_subword_prefix, 0),
|
755
|
+
)?;
|
756
|
+
class.define_method(
|
757
|
+
"continuing_subword_prefix=",
|
758
|
+
method!(RbTrainer::bpe_trainer_set_continuing_subword_prefix, 1),
|
759
|
+
)?;
|
760
|
+
class.define_method(
|
761
|
+
"end_of_word_suffix",
|
762
|
+
method!(RbTrainer::bpe_trainer_end_of_word_suffix, 0),
|
763
|
+
)?;
|
764
|
+
class.define_method(
|
765
|
+
"end_of_word_suffix=",
|
766
|
+
method!(RbTrainer::bpe_trainer_set_end_of_word_suffix, 1),
|
767
|
+
)?;
|
708
768
|
|
709
769
|
let class = module.define_class("UnigramTrainer", trainer)?;
|
710
770
|
class.define_singleton_method("_new", function!(RbUnigramTrainer::new, 1))?;
|
711
|
-
class.define_method(
|
712
|
-
|
713
|
-
|
714
|
-
|
715
|
-
class.define_method(
|
716
|
-
|
717
|
-
|
718
|
-
|
771
|
+
class.define_method(
|
772
|
+
"vocab_size",
|
773
|
+
method!(RbTrainer::unigram_trainer_vocab_size, 0),
|
774
|
+
)?;
|
775
|
+
class.define_method(
|
776
|
+
"vocab_size=",
|
777
|
+
method!(RbTrainer::unigram_trainer_set_vocab_size, 1),
|
778
|
+
)?;
|
779
|
+
class.define_method(
|
780
|
+
"show_progress",
|
781
|
+
method!(RbTrainer::unigram_trainer_show_progress, 0),
|
782
|
+
)?;
|
783
|
+
class.define_method(
|
784
|
+
"show_progress=",
|
785
|
+
method!(RbTrainer::unigram_trainer_set_show_progress, 1),
|
786
|
+
)?;
|
787
|
+
class.define_method(
|
788
|
+
"special_tokens",
|
789
|
+
method!(RbTrainer::unigram_trainer_special_tokens, 0),
|
790
|
+
)?;
|
791
|
+
class.define_method(
|
792
|
+
"special_tokens=",
|
793
|
+
method!(RbTrainer::unigram_trainer_set_special_tokens, 1),
|
794
|
+
)?;
|
795
|
+
class.define_method(
|
796
|
+
"initial_alphabet",
|
797
|
+
method!(RbTrainer::unigram_trainer_initial_alphabet, 0),
|
798
|
+
)?;
|
799
|
+
class.define_method(
|
800
|
+
"initial_alphabet=",
|
801
|
+
method!(RbTrainer::unigram_trainer_set_initial_alphabet, 1),
|
802
|
+
)?;
|
719
803
|
|
720
804
|
let class = module.define_class("WordLevelTrainer", trainer)?;
|
721
805
|
class.define_singleton_method("_new", function!(RbWordLevelTrainer::new, 1))?;
|
722
|
-
class.define_method(
|
723
|
-
|
724
|
-
|
725
|
-
|
726
|
-
class.define_method(
|
727
|
-
|
728
|
-
|
729
|
-
|
806
|
+
class.define_method(
|
807
|
+
"vocab_size",
|
808
|
+
method!(RbTrainer::word_level_trainer_vocab_size, 0),
|
809
|
+
)?;
|
810
|
+
class.define_method(
|
811
|
+
"vocab_size=",
|
812
|
+
method!(RbTrainer::word_level_trainer_set_vocab_size, 1),
|
813
|
+
)?;
|
814
|
+
class.define_method(
|
815
|
+
"min_frequency",
|
816
|
+
method!(RbTrainer::word_level_trainer_min_frequency, 0),
|
817
|
+
)?;
|
818
|
+
class.define_method(
|
819
|
+
"min_frequency=",
|
820
|
+
method!(RbTrainer::word_level_trainer_set_min_frequency, 1),
|
821
|
+
)?;
|
822
|
+
class.define_method(
|
823
|
+
"show_progress",
|
824
|
+
method!(RbTrainer::word_level_trainer_show_progress, 0),
|
825
|
+
)?;
|
826
|
+
class.define_method(
|
827
|
+
"show_progress=",
|
828
|
+
method!(RbTrainer::word_level_trainer_set_show_progress, 1),
|
829
|
+
)?;
|
830
|
+
class.define_method(
|
831
|
+
"special_tokens",
|
832
|
+
method!(RbTrainer::word_level_trainer_special_tokens, 0),
|
833
|
+
)?;
|
834
|
+
class.define_method(
|
835
|
+
"special_tokens=",
|
836
|
+
method!(RbTrainer::word_level_trainer_set_special_tokens, 1),
|
837
|
+
)?;
|
730
838
|
|
731
839
|
let class = module.define_class("WordPieceTrainer", trainer)?;
|
732
840
|
class.define_singleton_method("_new", function!(RbWordPieceTrainer::new, 1))?;
|
733
|
-
class.define_method(
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
class.define_method(
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
class.define_method(
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
class.define_method(
|
746
|
-
|
747
|
-
|
748
|
-
|
841
|
+
class.define_method(
|
842
|
+
"vocab_size",
|
843
|
+
method!(RbTrainer::word_piece_trainer_vocab_size, 0),
|
844
|
+
)?;
|
845
|
+
class.define_method(
|
846
|
+
"vocab_size=",
|
847
|
+
method!(RbTrainer::word_piece_trainer_set_vocab_size, 1),
|
848
|
+
)?;
|
849
|
+
class.define_method(
|
850
|
+
"min_frequency",
|
851
|
+
method!(RbTrainer::word_piece_trainer_min_frequency, 0),
|
852
|
+
)?;
|
853
|
+
class.define_method(
|
854
|
+
"min_frequency=",
|
855
|
+
method!(RbTrainer::word_piece_trainer_set_min_frequency, 1),
|
856
|
+
)?;
|
857
|
+
class.define_method(
|
858
|
+
"show_progress",
|
859
|
+
method!(RbTrainer::word_piece_trainer_show_progress, 0),
|
860
|
+
)?;
|
861
|
+
class.define_method(
|
862
|
+
"show_progress=",
|
863
|
+
method!(RbTrainer::word_piece_trainer_set_show_progress, 1),
|
864
|
+
)?;
|
865
|
+
class.define_method(
|
866
|
+
"special_tokens",
|
867
|
+
method!(RbTrainer::word_piece_trainer_special_tokens, 0),
|
868
|
+
)?;
|
869
|
+
class.define_method(
|
870
|
+
"special_tokens=",
|
871
|
+
method!(RbTrainer::word_piece_trainer_set_special_tokens, 1),
|
872
|
+
)?;
|
873
|
+
class.define_method(
|
874
|
+
"limit_alphabet",
|
875
|
+
method!(RbTrainer::word_piece_trainer_limit_alphabet, 0),
|
876
|
+
)?;
|
877
|
+
class.define_method(
|
878
|
+
"limit_alphabet=",
|
879
|
+
method!(RbTrainer::word_piece_trainer_set_limit_alphabet, 1),
|
880
|
+
)?;
|
881
|
+
class.define_method(
|
882
|
+
"initial_alphabet",
|
883
|
+
method!(RbTrainer::word_piece_trainer_initial_alphabet, 0),
|
884
|
+
)?;
|
885
|
+
class.define_method(
|
886
|
+
"initial_alphabet=",
|
887
|
+
method!(RbTrainer::word_piece_trainer_set_initial_alphabet, 1),
|
888
|
+
)?;
|
889
|
+
class.define_method(
|
890
|
+
"continuing_subword_prefix",
|
891
|
+
method!(RbTrainer::word_piece_trainer_continuing_subword_prefix, 0),
|
892
|
+
)?;
|
893
|
+
class.define_method(
|
894
|
+
"continuing_subword_prefix=",
|
895
|
+
method!(
|
896
|
+
RbTrainer::word_piece_trainer_set_continuing_subword_prefix,
|
897
|
+
1
|
898
|
+
),
|
899
|
+
)?;
|
900
|
+
class.define_method(
|
901
|
+
"end_of_word_suffix",
|
902
|
+
method!(RbTrainer::word_piece_trainer_end_of_word_suffix, 0),
|
903
|
+
)?;
|
904
|
+
class.define_method(
|
905
|
+
"end_of_word_suffix=",
|
906
|
+
method!(RbTrainer::word_piece_trainer_set_end_of_word_suffix, 1),
|
907
|
+
)?;
|
749
908
|
|
750
909
|
Ok(())
|
751
910
|
}
|
@@ -1,6 +1,6 @@
|
|
1
|
-
use onig::Regex;
|
2
|
-
use magnus::{exception, prelude::*, value::Lazy, Error, RClass, Ruby};
|
3
1
|
use crate::{RbResult, TOKENIZERS};
|
2
|
+
use magnus::{exception, prelude::*, value::Lazy, Error, RClass, Ruby};
|
3
|
+
use onig::Regex;
|
4
4
|
|
5
5
|
#[magnus::wrap(class = "Tokenizers::Regex")]
|
6
6
|
pub struct RbRegex {
|
@@ -11,13 +11,15 @@ pub struct RbRegex {
|
|
11
11
|
impl RbRegex {
|
12
12
|
pub fn new(s: String) -> RbResult<Self> {
|
13
13
|
Ok(Self {
|
14
|
-
inner: Regex::new(&s)
|
14
|
+
inner: Regex::new(&s)
|
15
|
+
.map_err(|e| Error::new(exception::runtime_error(), e.description().to_owned()))?,
|
15
16
|
pattern: s,
|
16
17
|
})
|
17
18
|
}
|
18
19
|
}
|
19
20
|
|
20
|
-
static REGEX: Lazy<RClass> =
|
21
|
+
static REGEX: Lazy<RClass> =
|
22
|
+
Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Regex").unwrap());
|
21
23
|
|
22
24
|
pub fn regex() -> RClass {
|
23
25
|
Ruby::get().unwrap().get_inner(&REGEX)
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.3
|
4
|
+
version: 0.5.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
|
-
autorequire:
|
9
8
|
bindir: bin
|
10
9
|
cert_chain: []
|
11
|
-
date: 2024-
|
10
|
+
date: 2024-12-29 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: rb_sys
|
@@ -24,7 +23,6 @@ dependencies:
|
|
24
23
|
- - ">="
|
25
24
|
- !ruby/object:Gem::Version
|
26
25
|
version: '0'
|
27
|
-
description:
|
28
26
|
email: andrew@ankane.org
|
29
27
|
executables: []
|
30
28
|
extensions:
|
@@ -86,7 +84,6 @@ homepage: https://github.com/ankane/tokenizers-ruby
|
|
86
84
|
licenses:
|
87
85
|
- Apache-2.0
|
88
86
|
metadata: {}
|
89
|
-
post_install_message:
|
90
87
|
rdoc_options: []
|
91
88
|
require_paths:
|
92
89
|
- lib
|
@@ -101,8 +98,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
101
98
|
- !ruby/object:Gem::Version
|
102
99
|
version: '0'
|
103
100
|
requirements: []
|
104
|
-
rubygems_version: 3.
|
105
|
-
signing_key:
|
101
|
+
rubygems_version: 3.6.2
|
106
102
|
specification_version: 4
|
107
103
|
summary: Fast state-of-the-art tokenizers for Ruby
|
108
104
|
test_files: []
|