tokenizers 0.6.3 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/Cargo.lock +21 -22
- data/ext/tokenizers/Cargo.toml +3 -2
- data/ext/tokenizers/src/decoders.rs +31 -28
- data/ext/tokenizers/src/encoding.rs +42 -11
- data/ext/tokenizers/src/error.rs +10 -5
- data/ext/tokenizers/src/lib.rs +4 -91
- data/ext/tokenizers/src/models.rs +21 -21
- data/ext/tokenizers/src/normalizers.rs +15 -15
- data/ext/tokenizers/src/pre_tokenizers.rs +15 -15
- data/ext/tokenizers/src/processors.rs +145 -15
- data/ext/tokenizers/src/ruby.rs +51 -0
- data/ext/tokenizers/src/tokenizer.rs +381 -244
- data/ext/tokenizers/src/trainers.rs +55 -49
- data/ext/tokenizers/src/utils/normalization.rs +2 -1
- data/ext/tokenizers/src/utils/regex.rs +2 -2
- data/lib/tokenizers/from_pretrained.rb +6 -2
- data/lib/tokenizers/processors/sequence.rb +9 -0
- data/lib/tokenizers/tokenizer.rb +4 -0
- data/lib/tokenizers/version.rb +1 -1
- metadata +5 -3
|
@@ -14,8 +14,8 @@ use tk::Trainer;
|
|
|
14
14
|
use super::{RbResult, TRAINERS};
|
|
15
15
|
|
|
16
16
|
#[derive(DataTypeFunctions, Clone, Deserialize, Serialize)]
|
|
17
|
+
#[serde(transparent)]
|
|
17
18
|
pub struct RbTrainer {
|
|
18
|
-
#[serde(flatten)]
|
|
19
19
|
pub trainer: Arc<RwLock<TrainerWrapper>>,
|
|
20
20
|
}
|
|
21
21
|
|
|
@@ -67,7 +67,7 @@ macro_rules! setter {
|
|
|
67
67
|
}
|
|
68
68
|
|
|
69
69
|
impl RbTrainer {
|
|
70
|
-
fn
|
|
70
|
+
fn bpe_trainer_get_vocab_size(&self) -> usize {
|
|
71
71
|
getter!(self, BpeTrainer, vocab_size)
|
|
72
72
|
}
|
|
73
73
|
|
|
@@ -75,7 +75,7 @@ impl RbTrainer {
|
|
|
75
75
|
setter!(self, BpeTrainer, vocab_size, vocab_size);
|
|
76
76
|
}
|
|
77
77
|
|
|
78
|
-
fn
|
|
78
|
+
fn bpe_trainer_get_min_frequency(&self) -> u64 {
|
|
79
79
|
getter!(self, BpeTrainer, min_frequency)
|
|
80
80
|
}
|
|
81
81
|
|
|
@@ -83,7 +83,7 @@ impl RbTrainer {
|
|
|
83
83
|
setter!(self, BpeTrainer, min_frequency, freq);
|
|
84
84
|
}
|
|
85
85
|
|
|
86
|
-
fn
|
|
86
|
+
fn bpe_trainer_get_show_progress(&self) -> bool {
|
|
87
87
|
getter!(self, BpeTrainer, show_progress)
|
|
88
88
|
}
|
|
89
89
|
|
|
@@ -91,7 +91,7 @@ impl RbTrainer {
|
|
|
91
91
|
setter!(self, BpeTrainer, show_progress, show_progress);
|
|
92
92
|
}
|
|
93
93
|
|
|
94
|
-
fn
|
|
94
|
+
fn bpe_trainer_get_special_tokens(&self) -> Vec<String> {
|
|
95
95
|
getter!(
|
|
96
96
|
self,
|
|
97
97
|
BpeTrainer,
|
|
@@ -121,7 +121,7 @@ impl RbTrainer {
|
|
|
121
121
|
Ok(())
|
|
122
122
|
}
|
|
123
123
|
|
|
124
|
-
fn
|
|
124
|
+
fn bpe_trainer_get_limit_alphabet(&self) -> Option<usize> {
|
|
125
125
|
getter!(self, BpeTrainer, limit_alphabet)
|
|
126
126
|
}
|
|
127
127
|
|
|
@@ -129,7 +129,7 @@ impl RbTrainer {
|
|
|
129
129
|
setter!(self, BpeTrainer, limit_alphabet, limit);
|
|
130
130
|
}
|
|
131
131
|
|
|
132
|
-
fn
|
|
132
|
+
fn bpe_trainer_get_initial_alphabet(&self) -> Vec<String> {
|
|
133
133
|
getter!(
|
|
134
134
|
self,
|
|
135
135
|
BpeTrainer,
|
|
@@ -146,7 +146,7 @@ impl RbTrainer {
|
|
|
146
146
|
);
|
|
147
147
|
}
|
|
148
148
|
|
|
149
|
-
fn
|
|
149
|
+
fn bpe_trainer_get_continuing_subword_prefix(&self) -> Option<String> {
|
|
150
150
|
getter!(self, BpeTrainer, continuing_subword_prefix.clone())
|
|
151
151
|
}
|
|
152
152
|
|
|
@@ -154,7 +154,7 @@ impl RbTrainer {
|
|
|
154
154
|
setter!(self, BpeTrainer, continuing_subword_prefix, prefix);
|
|
155
155
|
}
|
|
156
156
|
|
|
157
|
-
fn
|
|
157
|
+
fn bpe_trainer_get_end_of_word_suffix(&self) -> Option<String> {
|
|
158
158
|
getter!(self, BpeTrainer, end_of_word_suffix.clone())
|
|
159
159
|
}
|
|
160
160
|
|
|
@@ -162,7 +162,7 @@ impl RbTrainer {
|
|
|
162
162
|
setter!(self, BpeTrainer, end_of_word_suffix, suffix);
|
|
163
163
|
}
|
|
164
164
|
|
|
165
|
-
fn
|
|
165
|
+
fn unigram_trainer_get_vocab_size(&self) -> u32 {
|
|
166
166
|
getter!(self, UnigramTrainer, vocab_size)
|
|
167
167
|
}
|
|
168
168
|
|
|
@@ -170,7 +170,7 @@ impl RbTrainer {
|
|
|
170
170
|
setter!(self, UnigramTrainer, vocab_size, vocab_size);
|
|
171
171
|
}
|
|
172
172
|
|
|
173
|
-
fn
|
|
173
|
+
fn unigram_trainer_get_show_progress(&self) -> bool {
|
|
174
174
|
getter!(self, UnigramTrainer, show_progress)
|
|
175
175
|
}
|
|
176
176
|
|
|
@@ -178,7 +178,7 @@ impl RbTrainer {
|
|
|
178
178
|
setter!(self, UnigramTrainer, show_progress, show_progress);
|
|
179
179
|
}
|
|
180
180
|
|
|
181
|
-
fn
|
|
181
|
+
fn unigram_trainer_get_special_tokens(&self) -> Vec<String> {
|
|
182
182
|
getter!(
|
|
183
183
|
self,
|
|
184
184
|
UnigramTrainer,
|
|
@@ -208,7 +208,7 @@ impl RbTrainer {
|
|
|
208
208
|
Ok(())
|
|
209
209
|
}
|
|
210
210
|
|
|
211
|
-
fn
|
|
211
|
+
fn unigram_trainer_get_initial_alphabet(&self) -> Vec<String> {
|
|
212
212
|
getter!(
|
|
213
213
|
self,
|
|
214
214
|
UnigramTrainer,
|
|
@@ -225,7 +225,7 @@ impl RbTrainer {
|
|
|
225
225
|
);
|
|
226
226
|
}
|
|
227
227
|
|
|
228
|
-
fn
|
|
228
|
+
fn word_level_trainer_get_vocab_size(&self) -> usize {
|
|
229
229
|
getter!(self, WordLevelTrainer, vocab_size)
|
|
230
230
|
}
|
|
231
231
|
|
|
@@ -233,7 +233,7 @@ impl RbTrainer {
|
|
|
233
233
|
setter!(self, WordLevelTrainer, vocab_size, vocab_size);
|
|
234
234
|
}
|
|
235
235
|
|
|
236
|
-
fn
|
|
236
|
+
fn word_level_trainer_get_min_frequency(&self) -> u64 {
|
|
237
237
|
getter!(self, WordLevelTrainer, min_frequency)
|
|
238
238
|
}
|
|
239
239
|
|
|
@@ -241,7 +241,7 @@ impl RbTrainer {
|
|
|
241
241
|
setter!(self, WordLevelTrainer, min_frequency, freq);
|
|
242
242
|
}
|
|
243
243
|
|
|
244
|
-
fn
|
|
244
|
+
fn word_level_trainer_get_show_progress(&self) -> bool {
|
|
245
245
|
getter!(self, WordLevelTrainer, show_progress)
|
|
246
246
|
}
|
|
247
247
|
|
|
@@ -249,7 +249,7 @@ impl RbTrainer {
|
|
|
249
249
|
setter!(self, WordLevelTrainer, show_progress, show_progress);
|
|
250
250
|
}
|
|
251
251
|
|
|
252
|
-
fn
|
|
252
|
+
fn word_level_trainer_get_special_tokens(&self) -> Vec<String> {
|
|
253
253
|
getter!(
|
|
254
254
|
self,
|
|
255
255
|
WordLevelTrainer,
|
|
@@ -279,7 +279,7 @@ impl RbTrainer {
|
|
|
279
279
|
Ok(())
|
|
280
280
|
}
|
|
281
281
|
|
|
282
|
-
fn
|
|
282
|
+
fn word_piece_trainer_get_vocab_size(&self) -> usize {
|
|
283
283
|
getter!(self, WordPieceTrainer, vocab_size())
|
|
284
284
|
}
|
|
285
285
|
|
|
@@ -287,7 +287,7 @@ impl RbTrainer {
|
|
|
287
287
|
setter!(self, WordPieceTrainer, @set_vocab_size, vocab_size);
|
|
288
288
|
}
|
|
289
289
|
|
|
290
|
-
fn
|
|
290
|
+
fn word_piece_trainer_get_min_frequency(&self) -> u64 {
|
|
291
291
|
getter!(self, WordPieceTrainer, min_frequency())
|
|
292
292
|
}
|
|
293
293
|
|
|
@@ -295,7 +295,7 @@ impl RbTrainer {
|
|
|
295
295
|
setter!(self, WordPieceTrainer, @set_min_frequency, freq);
|
|
296
296
|
}
|
|
297
297
|
|
|
298
|
-
fn
|
|
298
|
+
fn word_piece_trainer_get_show_progress(&self) -> bool {
|
|
299
299
|
getter!(self, WordPieceTrainer, show_progress())
|
|
300
300
|
}
|
|
301
301
|
|
|
@@ -303,7 +303,7 @@ impl RbTrainer {
|
|
|
303
303
|
setter!(self, WordPieceTrainer, @set_show_progress, show_progress);
|
|
304
304
|
}
|
|
305
305
|
|
|
306
|
-
fn
|
|
306
|
+
fn word_piece_trainer_get_special_tokens(&self) -> Vec<String> {
|
|
307
307
|
getter!(
|
|
308
308
|
self,
|
|
309
309
|
WordPieceTrainer,
|
|
@@ -333,7 +333,7 @@ impl RbTrainer {
|
|
|
333
333
|
Ok(())
|
|
334
334
|
}
|
|
335
335
|
|
|
336
|
-
fn
|
|
336
|
+
fn word_piece_trainer_get_limit_alphabet(&self) -> Option<usize> {
|
|
337
337
|
getter!(self, WordPieceTrainer, limit_alphabet())
|
|
338
338
|
}
|
|
339
339
|
|
|
@@ -341,7 +341,7 @@ impl RbTrainer {
|
|
|
341
341
|
setter!(self, WordPieceTrainer, @set_limit_alphabet, limit);
|
|
342
342
|
}
|
|
343
343
|
|
|
344
|
-
fn
|
|
344
|
+
fn word_piece_trainer_get_initial_alphabet(&self) -> Vec<String> {
|
|
345
345
|
getter!(
|
|
346
346
|
self,
|
|
347
347
|
WordPieceTrainer,
|
|
@@ -358,7 +358,7 @@ impl RbTrainer {
|
|
|
358
358
|
);
|
|
359
359
|
}
|
|
360
360
|
|
|
361
|
-
fn
|
|
361
|
+
fn word_piece_trainer_get_continuing_subword_prefix(&self) -> Option<String> {
|
|
362
362
|
getter!(self, WordPieceTrainer, continuing_subword_prefix().clone())
|
|
363
363
|
}
|
|
364
364
|
|
|
@@ -366,7 +366,7 @@ impl RbTrainer {
|
|
|
366
366
|
setter!(self, WordPieceTrainer, @set_continuing_subword_prefix, prefix);
|
|
367
367
|
}
|
|
368
368
|
|
|
369
|
-
fn
|
|
369
|
+
fn word_piece_trainer_get_end_of_word_suffix(&self) -> Option<String> {
|
|
370
370
|
getter!(self, WordPieceTrainer, end_of_word_suffix().clone())
|
|
371
371
|
}
|
|
372
372
|
|
|
@@ -714,14 +714,17 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
714
714
|
|
|
715
715
|
let class = module.define_class("BpeTrainer", trainer)?;
|
|
716
716
|
class.define_singleton_method("_new", function!(RbBpeTrainer::new, 1))?;
|
|
717
|
-
class.define_method(
|
|
717
|
+
class.define_method(
|
|
718
|
+
"vocab_size",
|
|
719
|
+
method!(RbTrainer::bpe_trainer_get_vocab_size, 0),
|
|
720
|
+
)?;
|
|
718
721
|
class.define_method(
|
|
719
722
|
"vocab_size=",
|
|
720
723
|
method!(RbTrainer::bpe_trainer_set_vocab_size, 1),
|
|
721
724
|
)?;
|
|
722
725
|
class.define_method(
|
|
723
726
|
"min_frequency",
|
|
724
|
-
method!(RbTrainer::
|
|
727
|
+
method!(RbTrainer::bpe_trainer_get_min_frequency, 0),
|
|
725
728
|
)?;
|
|
726
729
|
class.define_method(
|
|
727
730
|
"min_frequency=",
|
|
@@ -729,7 +732,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
729
732
|
)?;
|
|
730
733
|
class.define_method(
|
|
731
734
|
"show_progress",
|
|
732
|
-
method!(RbTrainer::
|
|
735
|
+
method!(RbTrainer::bpe_trainer_get_show_progress, 0),
|
|
733
736
|
)?;
|
|
734
737
|
class.define_method(
|
|
735
738
|
"show_progress=",
|
|
@@ -737,7 +740,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
737
740
|
)?;
|
|
738
741
|
class.define_method(
|
|
739
742
|
"special_tokens",
|
|
740
|
-
method!(RbTrainer::
|
|
743
|
+
method!(RbTrainer::bpe_trainer_get_special_tokens, 0),
|
|
741
744
|
)?;
|
|
742
745
|
class.define_method(
|
|
743
746
|
"special_tokens=",
|
|
@@ -745,7 +748,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
745
748
|
)?;
|
|
746
749
|
class.define_method(
|
|
747
750
|
"limit_alphabet",
|
|
748
|
-
method!(RbTrainer::
|
|
751
|
+
method!(RbTrainer::bpe_trainer_get_limit_alphabet, 0),
|
|
749
752
|
)?;
|
|
750
753
|
class.define_method(
|
|
751
754
|
"limit_alphabet=",
|
|
@@ -753,7 +756,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
753
756
|
)?;
|
|
754
757
|
class.define_method(
|
|
755
758
|
"initial_alphabet",
|
|
756
|
-
method!(RbTrainer::
|
|
759
|
+
method!(RbTrainer::bpe_trainer_get_initial_alphabet, 0),
|
|
757
760
|
)?;
|
|
758
761
|
class.define_method(
|
|
759
762
|
"initial_alphabet=",
|
|
@@ -761,7 +764,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
761
764
|
)?;
|
|
762
765
|
class.define_method(
|
|
763
766
|
"continuing_subword_prefix",
|
|
764
|
-
method!(RbTrainer::
|
|
767
|
+
method!(RbTrainer::bpe_trainer_get_continuing_subword_prefix, 0),
|
|
765
768
|
)?;
|
|
766
769
|
class.define_method(
|
|
767
770
|
"continuing_subword_prefix=",
|
|
@@ -769,7 +772,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
769
772
|
)?;
|
|
770
773
|
class.define_method(
|
|
771
774
|
"end_of_word_suffix",
|
|
772
|
-
method!(RbTrainer::
|
|
775
|
+
method!(RbTrainer::bpe_trainer_get_end_of_word_suffix, 0),
|
|
773
776
|
)?;
|
|
774
777
|
class.define_method(
|
|
775
778
|
"end_of_word_suffix=",
|
|
@@ -780,7 +783,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
780
783
|
class.define_singleton_method("_new", function!(RbUnigramTrainer::new, 1))?;
|
|
781
784
|
class.define_method(
|
|
782
785
|
"vocab_size",
|
|
783
|
-
method!(RbTrainer::
|
|
786
|
+
method!(RbTrainer::unigram_trainer_get_vocab_size, 0),
|
|
784
787
|
)?;
|
|
785
788
|
class.define_method(
|
|
786
789
|
"vocab_size=",
|
|
@@ -788,7 +791,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
788
791
|
)?;
|
|
789
792
|
class.define_method(
|
|
790
793
|
"show_progress",
|
|
791
|
-
method!(RbTrainer::
|
|
794
|
+
method!(RbTrainer::unigram_trainer_get_show_progress, 0),
|
|
792
795
|
)?;
|
|
793
796
|
class.define_method(
|
|
794
797
|
"show_progress=",
|
|
@@ -796,7 +799,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
796
799
|
)?;
|
|
797
800
|
class.define_method(
|
|
798
801
|
"special_tokens",
|
|
799
|
-
method!(RbTrainer::
|
|
802
|
+
method!(RbTrainer::unigram_trainer_get_special_tokens, 0),
|
|
800
803
|
)?;
|
|
801
804
|
class.define_method(
|
|
802
805
|
"special_tokens=",
|
|
@@ -804,7 +807,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
804
807
|
)?;
|
|
805
808
|
class.define_method(
|
|
806
809
|
"initial_alphabet",
|
|
807
|
-
method!(RbTrainer::
|
|
810
|
+
method!(RbTrainer::unigram_trainer_get_initial_alphabet, 0),
|
|
808
811
|
)?;
|
|
809
812
|
class.define_method(
|
|
810
813
|
"initial_alphabet=",
|
|
@@ -815,7 +818,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
815
818
|
class.define_singleton_method("_new", function!(RbWordLevelTrainer::new, 1))?;
|
|
816
819
|
class.define_method(
|
|
817
820
|
"vocab_size",
|
|
818
|
-
method!(RbTrainer::
|
|
821
|
+
method!(RbTrainer::word_level_trainer_get_vocab_size, 0),
|
|
819
822
|
)?;
|
|
820
823
|
class.define_method(
|
|
821
824
|
"vocab_size=",
|
|
@@ -823,7 +826,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
823
826
|
)?;
|
|
824
827
|
class.define_method(
|
|
825
828
|
"min_frequency",
|
|
826
|
-
method!(RbTrainer::
|
|
829
|
+
method!(RbTrainer::word_level_trainer_get_min_frequency, 0),
|
|
827
830
|
)?;
|
|
828
831
|
class.define_method(
|
|
829
832
|
"min_frequency=",
|
|
@@ -831,7 +834,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
831
834
|
)?;
|
|
832
835
|
class.define_method(
|
|
833
836
|
"show_progress",
|
|
834
|
-
method!(RbTrainer::
|
|
837
|
+
method!(RbTrainer::word_level_trainer_get_show_progress, 0),
|
|
835
838
|
)?;
|
|
836
839
|
class.define_method(
|
|
837
840
|
"show_progress=",
|
|
@@ -839,7 +842,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
839
842
|
)?;
|
|
840
843
|
class.define_method(
|
|
841
844
|
"special_tokens",
|
|
842
|
-
method!(RbTrainer::
|
|
845
|
+
method!(RbTrainer::word_level_trainer_get_special_tokens, 0),
|
|
843
846
|
)?;
|
|
844
847
|
class.define_method(
|
|
845
848
|
"special_tokens=",
|
|
@@ -850,7 +853,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
850
853
|
class.define_singleton_method("_new", function!(RbWordPieceTrainer::new, 1))?;
|
|
851
854
|
class.define_method(
|
|
852
855
|
"vocab_size",
|
|
853
|
-
method!(RbTrainer::
|
|
856
|
+
method!(RbTrainer::word_piece_trainer_get_vocab_size, 0),
|
|
854
857
|
)?;
|
|
855
858
|
class.define_method(
|
|
856
859
|
"vocab_size=",
|
|
@@ -858,7 +861,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
858
861
|
)?;
|
|
859
862
|
class.define_method(
|
|
860
863
|
"min_frequency",
|
|
861
|
-
method!(RbTrainer::
|
|
864
|
+
method!(RbTrainer::word_piece_trainer_get_min_frequency, 0),
|
|
862
865
|
)?;
|
|
863
866
|
class.define_method(
|
|
864
867
|
"min_frequency=",
|
|
@@ -866,7 +869,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
866
869
|
)?;
|
|
867
870
|
class.define_method(
|
|
868
871
|
"show_progress",
|
|
869
|
-
method!(RbTrainer::
|
|
872
|
+
method!(RbTrainer::word_piece_trainer_get_show_progress, 0),
|
|
870
873
|
)?;
|
|
871
874
|
class.define_method(
|
|
872
875
|
"show_progress=",
|
|
@@ -874,7 +877,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
874
877
|
)?;
|
|
875
878
|
class.define_method(
|
|
876
879
|
"special_tokens",
|
|
877
|
-
method!(RbTrainer::
|
|
880
|
+
method!(RbTrainer::word_piece_trainer_get_special_tokens, 0),
|
|
878
881
|
)?;
|
|
879
882
|
class.define_method(
|
|
880
883
|
"special_tokens=",
|
|
@@ -882,7 +885,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
882
885
|
)?;
|
|
883
886
|
class.define_method(
|
|
884
887
|
"limit_alphabet",
|
|
885
|
-
method!(RbTrainer::
|
|
888
|
+
method!(RbTrainer::word_piece_trainer_get_limit_alphabet, 0),
|
|
886
889
|
)?;
|
|
887
890
|
class.define_method(
|
|
888
891
|
"limit_alphabet=",
|
|
@@ -890,7 +893,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
890
893
|
)?;
|
|
891
894
|
class.define_method(
|
|
892
895
|
"initial_alphabet",
|
|
893
|
-
method!(RbTrainer::
|
|
896
|
+
method!(RbTrainer::word_piece_trainer_get_initial_alphabet, 0),
|
|
894
897
|
)?;
|
|
895
898
|
class.define_method(
|
|
896
899
|
"initial_alphabet=",
|
|
@@ -898,7 +901,10 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
898
901
|
)?;
|
|
899
902
|
class.define_method(
|
|
900
903
|
"continuing_subword_prefix",
|
|
901
|
-
method!(
|
|
904
|
+
method!(
|
|
905
|
+
RbTrainer::word_piece_trainer_get_continuing_subword_prefix,
|
|
906
|
+
0
|
|
907
|
+
),
|
|
902
908
|
)?;
|
|
903
909
|
class.define_method(
|
|
904
910
|
"continuing_subword_prefix=",
|
|
@@ -909,7 +915,7 @@ pub fn init_trainers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
909
915
|
)?;
|
|
910
916
|
class.define_method(
|
|
911
917
|
"end_of_word_suffix",
|
|
912
|
-
method!(RbTrainer::
|
|
918
|
+
method!(RbTrainer::word_piece_trainer_get_end_of_word_suffix, 0),
|
|
913
919
|
)?;
|
|
914
920
|
class.define_method(
|
|
915
921
|
"end_of_word_suffix=",
|
|
@@ -13,7 +13,8 @@ pub enum RbPattern<'p> {
|
|
|
13
13
|
|
|
14
14
|
impl TryConvert for RbPattern<'_> {
|
|
15
15
|
fn try_convert(obj: Value) -> RbResult<Self> {
|
|
16
|
-
|
|
16
|
+
let ruby = &Ruby::get_with(obj);
|
|
17
|
+
if obj.is_kind_of(regex(ruby)) {
|
|
17
18
|
Ok(RbPattern::Regex(TryConvert::try_convert(obj)?))
|
|
18
19
|
} else {
|
|
19
20
|
Ok(RbPattern::Str(TryConvert::try_convert(obj)?))
|
|
@@ -22,6 +22,6 @@ impl RbRegex {
|
|
|
22
22
|
static REGEX: Lazy<RClass> =
|
|
23
23
|
Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Regex").unwrap());
|
|
24
24
|
|
|
25
|
-
pub fn regex() -> RClass {
|
|
26
|
-
|
|
25
|
+
pub fn regex(ruby: &Ruby) -> RClass {
|
|
26
|
+
ruby.get_inner(&REGEX)
|
|
27
27
|
}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
module Tokenizers
|
|
2
2
|
module FromPretrained
|
|
3
3
|
# for user agent
|
|
4
|
-
TOKENIZERS_VERSION = "0.
|
|
4
|
+
TOKENIZERS_VERSION = "0.23.1"
|
|
5
5
|
|
|
6
6
|
# use Ruby for downloads
|
|
7
7
|
# this avoids the need to vendor OpenSSL on Linux
|
|
@@ -55,7 +55,7 @@ module Tokenizers
|
|
|
55
55
|
resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
|
|
56
56
|
if File.exist?(resource_path)
|
|
57
57
|
res = head_request(url, headers, options)
|
|
58
|
-
if res["etag"] == etag
|
|
58
|
+
if normalize_etag(res["etag"]) == normalize_etag(etag)
|
|
59
59
|
return resource_path
|
|
60
60
|
end
|
|
61
61
|
end
|
|
@@ -106,6 +106,10 @@ module Tokenizers
|
|
|
106
106
|
res
|
|
107
107
|
end
|
|
108
108
|
|
|
109
|
+
def normalize_etag(etag)
|
|
110
|
+
etag.delete_prefix("W/") if etag
|
|
111
|
+
end
|
|
112
|
+
|
|
109
113
|
def cache_dir
|
|
110
114
|
cache_dir =
|
|
111
115
|
if ENV["TOKENIZERS_CACHE"]
|
data/lib/tokenizers/tokenizer.rb
CHANGED
|
@@ -18,6 +18,10 @@ module Tokenizers
|
|
|
18
18
|
_encode_batch(input, is_pretokenized, add_special_tokens)
|
|
19
19
|
end
|
|
20
20
|
|
|
21
|
+
def encode_batch_fast(input, is_pretokenized: false, add_special_tokens: true)
|
|
22
|
+
_encode_batch_fast(input, is_pretokenized, add_special_tokens)
|
|
23
|
+
end
|
|
24
|
+
|
|
21
25
|
def decode(ids, skip_special_tokens: true)
|
|
22
26
|
_decode(ids, skip_special_tokens)
|
|
23
27
|
end
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tokenizers
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.7.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Kane
|
|
@@ -44,6 +44,7 @@ files:
|
|
|
44
44
|
- ext/tokenizers/src/normalizers.rs
|
|
45
45
|
- ext/tokenizers/src/pre_tokenizers.rs
|
|
46
46
|
- ext/tokenizers/src/processors.rs
|
|
47
|
+
- ext/tokenizers/src/ruby.rs
|
|
47
48
|
- ext/tokenizers/src/tokenizer.rs
|
|
48
49
|
- ext/tokenizers/src/trainers.rs
|
|
49
50
|
- ext/tokenizers/src/utils/mod.rs
|
|
@@ -73,6 +74,7 @@ files:
|
|
|
73
74
|
- lib/tokenizers/pre_tokenizers/split.rb
|
|
74
75
|
- lib/tokenizers/processors/byte_level.rb
|
|
75
76
|
- lib/tokenizers/processors/roberta_processing.rb
|
|
77
|
+
- lib/tokenizers/processors/sequence.rb
|
|
76
78
|
- lib/tokenizers/processors/template_processing.rb
|
|
77
79
|
- lib/tokenizers/tokenizer.rb
|
|
78
80
|
- lib/tokenizers/trainers/bpe_trainer.rb
|
|
@@ -91,14 +93,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
91
93
|
requirements:
|
|
92
94
|
- - ">="
|
|
93
95
|
- !ruby/object:Gem::Version
|
|
94
|
-
version: '3.
|
|
96
|
+
version: '3.3'
|
|
95
97
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
96
98
|
requirements:
|
|
97
99
|
- - ">="
|
|
98
100
|
- !ruby/object:Gem::Version
|
|
99
101
|
version: '0'
|
|
100
102
|
requirements: []
|
|
101
|
-
rubygems_version: 4.0.
|
|
103
|
+
rubygems_version: 4.0.6
|
|
102
104
|
specification_version: 4
|
|
103
105
|
summary: Fast state-of-the-art tokenizers for Ruby
|
|
104
106
|
test_files: []
|