tokenizers 0.4.4 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fe2e8b2ec97ac4dabb4e401cb426079c065be420bf8da41c9484b3d38290589c
4
- data.tar.gz: bba3b9d9d94a278e3f5189fda48b3b3a7f1ff369b66ae8b85945e92a2d8b90ed
3
+ metadata.gz: 258211e71ca06e96bb4ee01b15e29f6f74d3c70d04af246e95b178e10f093059
4
+ data.tar.gz: 6e0b01c577830afdf1c7d677b1377191420d85e0f1f8638893f72cbb7ccef322
5
5
  SHA512:
6
- metadata.gz: fd5876cc95c22b917f1a5715f9b9651f714291f017c00f83316ad2c3cc48c8819d4de70288b4067570c9f729cca9d06cbfa8562996a61ac694e202268a58cdea
7
- data.tar.gz: cb793a73aabafe933ef8d7259b7a36848b1f38cacdf20ef3f6f8554671b7de6ece53b3c14f46f368f540ecbab5dfca087f730e99c22caf84c7b35d539b07b633
6
+ metadata.gz: 4e0ea1f11dbab96b213190397ee8676d6233568f4fe013970a5a2c32105ed20ec06a5c8bc7379065799de315a0fc6d5f47807f9af47bc6f47926e4147c3eabcc
7
+ data.tar.gz: ccd00b103577c6cff4dded6a3bc42394eccb3e24b950674a33eedf76df7c08bc89cda8219f076fce4cf20d90580da82c03e001a4e49ceb80e56ae4055b4617cf
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.5.0 (2024-05-21)
2
+
3
+ - Updated Tokenizers to 0.19.1
4
+ - Replaced `add_prefix_space` with `prepend_scheme` and `split` options for `Metaspace` decoder and pre-tokenizer
5
+ - Dropped support for Ruby < 3.1
6
+
1
7
  ## 0.4.4 (2024-02-27)
2
8
 
3
9
  - Updated Tokenizers to 0.15.2
data/Cargo.lock CHANGED
@@ -40,7 +40,7 @@ dependencies = [
40
40
  "regex",
41
41
  "rustc-hash",
42
42
  "shlex",
43
- "syn 2.0.38",
43
+ "syn",
44
44
  ]
45
45
 
46
46
  [[package]]
@@ -135,9 +135,9 @@ dependencies = [
135
135
 
136
136
  [[package]]
137
137
  name = "darling"
138
- version = "0.14.4"
138
+ version = "0.20.8"
139
139
  source = "registry+https://github.com/rust-lang/crates.io-index"
140
- checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850"
140
+ checksum = "54e36fcd13ed84ffdfda6f5be89b31287cbb80c439841fe69e04841435464391"
141
141
  dependencies = [
142
142
  "darling_core",
143
143
  "darling_macro",
@@ -145,58 +145,58 @@ dependencies = [
145
145
 
146
146
  [[package]]
147
147
  name = "darling_core"
148
- version = "0.14.4"
148
+ version = "0.20.8"
149
149
  source = "registry+https://github.com/rust-lang/crates.io-index"
150
- checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0"
150
+ checksum = "9c2cf1c23a687a1feeb728783b993c4e1ad83d99f351801977dd809b48d0a70f"
151
151
  dependencies = [
152
152
  "fnv",
153
153
  "ident_case",
154
154
  "proc-macro2",
155
155
  "quote",
156
156
  "strsim",
157
- "syn 1.0.109",
157
+ "syn",
158
158
  ]
159
159
 
160
160
  [[package]]
161
161
  name = "darling_macro"
162
- version = "0.14.4"
162
+ version = "0.20.8"
163
163
  source = "registry+https://github.com/rust-lang/crates.io-index"
164
- checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e"
164
+ checksum = "a668eda54683121533a393014d8692171709ff57a7d61f187b6e782719f8933f"
165
165
  dependencies = [
166
166
  "darling_core",
167
167
  "quote",
168
- "syn 1.0.109",
168
+ "syn",
169
169
  ]
170
170
 
171
171
  [[package]]
172
172
  name = "derive_builder"
173
- version = "0.12.0"
173
+ version = "0.20.0"
174
174
  source = "registry+https://github.com/rust-lang/crates.io-index"
175
- checksum = "8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8"
175
+ checksum = "0350b5cb0331628a5916d6c5c0b72e97393b8b6b03b47a9284f4e7f5a405ffd7"
176
176
  dependencies = [
177
177
  "derive_builder_macro",
178
178
  ]
179
179
 
180
180
  [[package]]
181
181
  name = "derive_builder_core"
182
- version = "0.12.0"
182
+ version = "0.20.0"
183
183
  source = "registry+https://github.com/rust-lang/crates.io-index"
184
- checksum = "c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f"
184
+ checksum = "d48cda787f839151732d396ac69e3473923d54312c070ee21e9effcaa8ca0b1d"
185
185
  dependencies = [
186
186
  "darling",
187
187
  "proc-macro2",
188
188
  "quote",
189
- "syn 1.0.109",
189
+ "syn",
190
190
  ]
191
191
 
192
192
  [[package]]
193
193
  name = "derive_builder_macro"
194
- version = "0.12.0"
194
+ version = "0.20.0"
195
195
  source = "registry+https://github.com/rust-lang/crates.io-index"
196
- checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e"
196
+ checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b"
197
197
  dependencies = [
198
198
  "derive_builder_core",
199
- "syn 1.0.109",
199
+ "syn",
200
200
  ]
201
201
 
202
202
  [[package]]
@@ -350,9 +350,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
350
350
 
351
351
  [[package]]
352
352
  name = "magnus"
353
- version = "0.6.2"
353
+ version = "0.6.4"
354
354
  source = "registry+https://github.com/rust-lang/crates.io-index"
355
- checksum = "4778544796676e8428e9c622460ebf284bea52d8b10db3aeb449d8b5e61b3a13"
355
+ checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
356
356
  dependencies = [
357
357
  "magnus-macros",
358
358
  "rb-sys",
@@ -368,7 +368,7 @@ checksum = "5968c820e2960565f647819f5928a42d6e874551cab9d88d75e3e0660d7f71e3"
368
368
  dependencies = [
369
369
  "proc-macro2",
370
370
  "quote",
371
- "syn 2.0.38",
371
+ "syn",
372
372
  ]
373
373
 
374
374
  [[package]]
@@ -394,9 +394,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
394
394
 
395
395
  [[package]]
396
396
  name = "monostate"
397
- version = "0.1.9"
397
+ version = "0.1.12"
398
398
  source = "registry+https://github.com/rust-lang/crates.io-index"
399
- checksum = "15f370ae88093ec6b11a710dec51321a61d420fafd1bad6e30d01bd9c920e8ee"
399
+ checksum = "a20fffcd8ca4c69d31e036a71abc400147b41f90895df4edcb36497a1f8af8bf"
400
400
  dependencies = [
401
401
  "monostate-impl",
402
402
  "serde",
@@ -404,13 +404,13 @@ dependencies = [
404
404
 
405
405
  [[package]]
406
406
  name = "monostate-impl"
407
- version = "0.1.9"
407
+ version = "0.1.12"
408
408
  source = "registry+https://github.com/rust-lang/crates.io-index"
409
- checksum = "371717c0a5543d6a800cac822eac735aa7d2d2fbb41002e9856a4089532dbdce"
409
+ checksum = "bf307cbbbd777a9c10cec88ddafee572b3484caad5cce0c9236523c3803105a6"
410
410
  dependencies = [
411
411
  "proc-macro2",
412
412
  "quote",
413
- "syn 2.0.38",
413
+ "syn",
414
414
  ]
415
415
 
416
416
  [[package]]
@@ -489,18 +489,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
489
489
 
490
490
  [[package]]
491
491
  name = "proc-macro2"
492
- version = "1.0.68"
492
+ version = "1.0.81"
493
493
  source = "registry+https://github.com/rust-lang/crates.io-index"
494
- checksum = "5b1106fec09662ec6dd98ccac0f81cef56984d0b49f75c92d8cbad76e20c005c"
494
+ checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba"
495
495
  dependencies = [
496
496
  "unicode-ident",
497
497
  ]
498
498
 
499
499
  [[package]]
500
500
  name = "quote"
501
- version = "1.0.33"
501
+ version = "1.0.36"
502
502
  source = "registry+https://github.com/rust-lang/crates.io-index"
503
- checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
503
+ checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
504
504
  dependencies = [
505
505
  "proc-macro2",
506
506
  ]
@@ -537,9 +537,9 @@ dependencies = [
537
537
 
538
538
  [[package]]
539
539
  name = "rayon"
540
- version = "1.8.0"
540
+ version = "1.10.0"
541
541
  source = "registry+https://github.com/rust-lang/crates.io-index"
542
- checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1"
542
+ checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
543
543
  dependencies = [
544
544
  "either",
545
545
  "rayon-core",
@@ -558,9 +558,9 @@ dependencies = [
558
558
 
559
559
  [[package]]
560
560
  name = "rayon-core"
561
- version = "1.12.0"
561
+ version = "1.12.1"
562
562
  source = "registry+https://github.com/rust-lang/crates.io-index"
563
- checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed"
563
+ checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
564
564
  dependencies = [
565
565
  "crossbeam-deque",
566
566
  "crossbeam-utils",
@@ -568,18 +568,18 @@ dependencies = [
568
568
 
569
569
  [[package]]
570
570
  name = "rb-sys"
571
- version = "0.9.89"
571
+ version = "0.9.97"
572
572
  source = "registry+https://github.com/rust-lang/crates.io-index"
573
- checksum = "0d197f2c03751ef006f29d593d22aa9068c9c358e04ca503afea0329c366147c"
573
+ checksum = "47d30bcad206b51f2f66121190ca678dce1fdf3a2eae0ac5d838d1818b19bdf5"
574
574
  dependencies = [
575
575
  "rb-sys-build",
576
576
  ]
577
577
 
578
578
  [[package]]
579
579
  name = "rb-sys-build"
580
- version = "0.9.89"
580
+ version = "0.9.97"
581
581
  source = "registry+https://github.com/rust-lang/crates.io-index"
582
- checksum = "2b50caf8fd028f12abe00d6debe2ae2adf6202c9ca3caa59487eda710d90fa28"
582
+ checksum = "3cbd92f281615f3c2dcb9dcb0f0576624752afbf9a7f99173b37c4b55b62dd8a"
583
583
  dependencies = [
584
584
  "bindgen",
585
585
  "lazy_static",
@@ -587,7 +587,7 @@ dependencies = [
587
587
  "quote",
588
588
  "regex",
589
589
  "shell-words",
590
- "syn 2.0.38",
590
+ "syn",
591
591
  ]
592
592
 
593
593
  [[package]]
@@ -598,33 +598,27 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
598
598
 
599
599
  [[package]]
600
600
  name = "regex"
601
- version = "1.9.5"
601
+ version = "1.10.4"
602
602
  source = "registry+https://github.com/rust-lang/crates.io-index"
603
- checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47"
603
+ checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
604
604
  dependencies = [
605
605
  "aho-corasick",
606
606
  "memchr",
607
607
  "regex-automata",
608
- "regex-syntax 0.7.5",
608
+ "regex-syntax",
609
609
  ]
610
610
 
611
611
  [[package]]
612
612
  name = "regex-automata"
613
- version = "0.3.8"
613
+ version = "0.4.6"
614
614
  source = "registry+https://github.com/rust-lang/crates.io-index"
615
- checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
615
+ checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
616
616
  dependencies = [
617
617
  "aho-corasick",
618
618
  "memchr",
619
- "regex-syntax 0.7.5",
619
+ "regex-syntax",
620
620
  ]
621
621
 
622
- [[package]]
623
- name = "regex-syntax"
624
- version = "0.7.5"
625
- source = "registry+https://github.com/rust-lang/crates.io-index"
626
- checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
627
-
628
622
  [[package]]
629
623
  name = "regex-syntax"
630
624
  version = "0.8.2"
@@ -672,7 +666,7 @@ checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2"
672
666
  dependencies = [
673
667
  "proc-macro2",
674
668
  "quote",
675
- "syn 2.0.38",
669
+ "syn",
676
670
  ]
677
671
 
678
672
  [[package]]
@@ -724,20 +718,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
724
718
 
725
719
  [[package]]
726
720
  name = "syn"
727
- version = "1.0.109"
728
- source = "registry+https://github.com/rust-lang/crates.io-index"
729
- checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
730
- dependencies = [
731
- "proc-macro2",
732
- "quote",
733
- "unicode-ident",
734
- ]
735
-
736
- [[package]]
737
- name = "syn"
738
- version = "2.0.38"
721
+ version = "2.0.59"
739
722
  source = "registry+https://github.com/rust-lang/crates.io-index"
740
- checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b"
723
+ checksum = "4a6531ffc7b071655e4ce2e04bd464c4830bb585a61cabb96cf808f05172615a"
741
724
  dependencies = [
742
725
  "proc-macro2",
743
726
  "quote",
@@ -761,24 +744,24 @@ checksum = "10712f02019e9288794769fba95cd6847df9874d49d871d062172f9dd41bc4cc"
761
744
  dependencies = [
762
745
  "proc-macro2",
763
746
  "quote",
764
- "syn 2.0.38",
747
+ "syn",
765
748
  ]
766
749
 
767
750
  [[package]]
768
751
  name = "tokenizers"
769
- version = "0.4.4"
752
+ version = "0.5.0"
770
753
  dependencies = [
771
754
  "magnus",
772
755
  "onig",
773
756
  "serde",
774
- "tokenizers 0.15.2",
757
+ "tokenizers 0.19.1",
775
758
  ]
776
759
 
777
760
  [[package]]
778
761
  name = "tokenizers"
779
- version = "0.15.2"
762
+ version = "0.19.1"
780
763
  source = "registry+https://github.com/rust-lang/crates.io-index"
781
- checksum = "3dd47962b0ba36e7fd33518fbf1754d136fd1474000162bbf2a8b5fcb2d3654d"
764
+ checksum = "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd"
782
765
  dependencies = [
783
766
  "aho-corasick",
784
767
  "derive_builder",
@@ -796,7 +779,7 @@ dependencies = [
796
779
  "rayon",
797
780
  "rayon-cond",
798
781
  "regex",
799
- "regex-syntax 0.8.2",
782
+ "regex-syntax",
800
783
  "serde",
801
784
  "serde_json",
802
785
  "spm_precompiled",
@@ -823,9 +806,9 @@ dependencies = [
823
806
 
824
807
  [[package]]
825
808
  name = "unicode-segmentation"
826
- version = "1.10.1"
809
+ version = "1.11.0"
827
810
  source = "registry+https://github.com/rust-lang/crates.io-index"
828
- checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
811
+ checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
829
812
 
830
813
  [[package]]
831
814
  name = "unicode-width"
@@ -1,10 +1,10 @@
1
1
  [package]
2
2
  name = "tokenizers"
3
- version = "0.4.4"
3
+ version = "0.5.0"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
7
- rust-version = "1.62.0"
7
+ rust-version = "1.63.0"
8
8
  publish = false
9
9
 
10
10
  [lib]
@@ -16,6 +16,6 @@ onig = { version = "6", default-features = false }
16
16
  serde = { version = "1", features = ["rc", "derive"] }
17
17
 
18
18
  [dependencies.tokenizers]
19
- version = "=0.15.2" # also update in from_pretrained.rb
19
+ version = "=0.19.1" # also update in from_pretrained.rb
20
20
  default-features = false
21
21
  features = ["progressbar", "onig", "esaxx_fast"]
@@ -1,5 +1,6 @@
1
1
  use std::sync::{Arc, RwLock};
2
2
 
3
+ use crate::pre_tokenizers::from_string;
3
4
  use magnus::value::Lazy;
4
5
  use magnus::{
5
6
  data_type_builder, function, method, Class, DataType, DataTypeFunctions, Module, Object, RClass, RModule,
@@ -11,7 +12,7 @@ use tk::decoders::byte_fallback::ByteFallback;
11
12
  use tk::decoders::byte_level::ByteLevel;
12
13
  use tk::decoders::ctc::CTC;
13
14
  use tk::decoders::fuse::Fuse;
14
- use tk::decoders::metaspace::Metaspace;
15
+ use tk::decoders::metaspace::{Metaspace, PrependScheme};
15
16
  use tk::decoders::strip::Strip;
16
17
  use tk::decoders::wordpiece::WordPiece;
17
18
  use tk::decoders::DecoderWrapper;
@@ -126,12 +127,29 @@ impl RbDecoder {
126
127
  setter!(self, Metaspace, @set_replacement, replacement);
127
128
  }
128
129
 
129
- pub fn metaspace_add_prefix_space(&self) -> bool {
130
- getter!(self, Metaspace, add_prefix_space)
130
+ pub fn metaspace_split(&self) -> bool {
131
+ getter!(self, Metaspace, get_split())
131
132
  }
132
133
 
133
- pub fn metaspace_set_add_prefix_space(&self, add_prefix_space: bool) {
134
- setter!(self, Metaspace, add_prefix_space, add_prefix_space);
134
+ pub fn metaspace_set_split(&self, split: bool) {
135
+ setter!(self, Metaspace, @set_split, split);
136
+ }
137
+
138
+ pub fn metaspace_prepend_scheme(&self) -> String {
139
+ // The getter yields a PrependScheme enum value; map it to its lowercase string name
140
+ let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
141
+ match scheme {
142
+ PrependScheme::First => "first",
143
+ PrependScheme::Never => "never",
144
+ PrependScheme::Always => "always",
145
+ }
146
+ .to_string()
147
+ }
148
+
149
+ pub fn metaspace_set_prepend_scheme(&self, prepend_scheme: String) -> RbResult<()> {
150
+ let scheme = from_string(prepend_scheme)?;
151
+ setter!(self, Metaspace, @set_prepend_scheme, scheme);
152
+ Ok(())
135
153
  }
136
154
 
137
155
  pub fn word_piece_cleanup(&self) -> bool {
@@ -194,8 +212,9 @@ impl RbFuse {
194
212
  pub struct RbMetaspaceDecoder {}
195
213
 
196
214
  impl RbMetaspaceDecoder {
197
- pub fn new(replacement: char, add_prefix_space: bool) -> RbDecoder {
198
- Metaspace::new(replacement, add_prefix_space).into()
215
+ pub fn new(replacement: char, prepend_scheme: String, split: bool) -> RbResult<RbDecoder> {
216
+ let prepend_scheme = from_string(prepend_scheme)?;
217
+ Ok(Metaspace::new(replacement, prepend_scheme, split).into())
199
218
  }
200
219
  }
201
220
 
@@ -364,11 +383,13 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
364
383
  class.define_singleton_method("new", function!(RbFuse::new, 0))?;
365
384
 
366
385
  let class = module.define_class("Metaspace", decoder)?;
367
- class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 2))?;
368
- class.define_method("add_prefix_space", method!(RbDecoder::metaspace_add_prefix_space, 0))?;
369
- class.define_method("add_prefix_space=", method!(RbDecoder::metaspace_set_add_prefix_space, 1))?;
386
+ class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 3))?;
387
+ class.define_method("prepend_scheme", method!(RbDecoder::metaspace_prepend_scheme, 0))?;
388
+ class.define_method("prepend_scheme=", method!(RbDecoder::metaspace_set_prepend_scheme, 1))?;
370
389
  class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
371
390
  class.define_method("replacement=", method!(RbDecoder::metaspace_set_replacement, 1))?;
391
+ class.define_method("split", method!(RbDecoder::metaspace_split, 0))?;
392
+ class.define_method("split=", method!(RbDecoder::metaspace_set_split, 1))?;
372
393
 
373
394
  let class = module.define_class("Replace", decoder)?;
374
395
  class.define_singleton_method("new", function!(RbReplaceDecoder::new, 2))?;
@@ -1,7 +1,7 @@
1
1
  use std::sync::{Arc, RwLock};
2
2
 
3
3
  use magnus::{
4
- data_type_builder, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Module, Object,
4
+ data_type_builder, exception, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Error, Module, Object,
5
5
  RArray, RClass, RModule, Ruby, TryConvert, TypedData,
6
6
  };
7
7
 
@@ -12,7 +12,7 @@ use tk::pre_tokenizers::bert::BertPreTokenizer;
12
12
  use tk::pre_tokenizers::byte_level::ByteLevel;
13
13
  use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
14
14
  use tk::pre_tokenizers::digits::Digits;
15
- use tk::pre_tokenizers::metaspace::Metaspace;
15
+ use tk::pre_tokenizers::metaspace::{Metaspace, PrependScheme};
16
16
  use tk::pre_tokenizers::punctuation::Punctuation;
17
17
  use tk::pre_tokenizers::split::Split;
18
18
  use tk::pre_tokenizers::unicode_scripts::UnicodeScripts;
@@ -118,14 +118,6 @@ impl RbPreTokenizer {
118
118
  setter!(self, Digits, individual_digits, individual_digits);
119
119
  }
120
120
 
121
- fn metaspace_add_prefix_space(&self) -> bool {
122
- getter!(self, Metaspace, add_prefix_space)
123
- }
124
-
125
- fn metaspace_set_add_prefix_space(&self, add_prefix_space: bool) {
126
- setter!(self, Metaspace, add_prefix_space, add_prefix_space);
127
- }
128
-
129
121
  fn metaspace_replacement(&self) -> String {
130
122
  getter!(self, Metaspace, get_replacement().to_string())
131
123
  }
@@ -133,6 +125,31 @@ impl RbPreTokenizer {
133
125
  fn metaspace_set_replacement(&self, replacement: char) {
134
126
  setter!(self, Metaspace, @set_replacement, replacement);
135
127
  }
128
+
129
+ fn metaspace_split(&self) -> bool {
130
+ getter!(self, Metaspace, get_split())
131
+ }
132
+
133
+ fn metaspace_set_split(&self, split: bool) {
134
+ setter!(self, Metaspace, @set_split, split);
135
+ }
136
+
137
+ fn metaspace_prepend_scheme(&self) -> String {
138
+ // The getter yields a PrependScheme enum value; map it to its lowercase string name
139
+ let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
140
+ match scheme {
141
+ PrependScheme::First => "first",
142
+ PrependScheme::Never => "never",
143
+ PrependScheme::Always => "always",
144
+ }
145
+ .to_string()
146
+ }
147
+
148
+ fn metaspace_set_prepend_scheme(&self, prepend_scheme: String) -> RbResult<()> {
149
+ let scheme = from_string(prepend_scheme)?;
150
+ setter!(self, Metaspace, @set_prepend_scheme, scheme);
151
+ Ok(())
152
+ }
136
153
  }
137
154
 
138
155
  impl PreTokenizer for RbPreTokenizer {
@@ -180,9 +197,11 @@ pub struct RbMetaspace {}
180
197
  impl RbMetaspace {
181
198
  fn new(
182
199
  replacement: char,
183
- add_prefix_space: bool,
184
- ) -> RbPreTokenizer {
185
- Metaspace::new(replacement, add_prefix_space).into()
200
+ prepend_scheme: String,
201
+ split: bool,
202
+ ) -> RbResult<RbPreTokenizer> {
203
+ let prepend_scheme = from_string(prepend_scheme)?;
204
+ Ok(Metaspace::new(replacement, prepend_scheme, split).into())
186
205
  }
187
206
  }
188
207
 
@@ -252,6 +271,21 @@ impl RbSequence {
252
271
  }
253
272
  }
254
273
 
274
+ pub(crate) fn from_string(string: String) -> RbResult<PrependScheme> {
275
+ let scheme = match string.as_str() {
276
+ "first" => PrependScheme::First,
277
+ "never" => PrependScheme::Never,
278
+ "always" => PrependScheme::Always,
279
+ _ => {
280
+ return Err(Error::new(exception::arg_error(), format!(
281
+ "{} is an unknown variant, should be one of ['first', 'never', 'always']",
282
+ string
283
+ )));
284
+ }
285
+ };
286
+ Ok(scheme)
287
+ }
288
+
255
289
  #[derive(Clone, Deserialize)]
256
290
  #[serde(untagged)]
257
291
  pub(crate) enum RbPreTokenizerWrapper {
@@ -465,11 +499,13 @@ pub fn init_pre_tokenizers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
465
499
  class.define_method("individual_digits=", method!(RbPreTokenizer::digits_set_individual_digits, 1))?;
466
500
 
467
501
  let class = module.define_class("Metaspace", pre_tokenizer)?;
468
- class.define_singleton_method("_new", function!(RbMetaspace::new, 2))?;
469
- class.define_method("add_prefix_space", method!(RbPreTokenizer::metaspace_add_prefix_space, 0))?;
470
- class.define_method("add_prefix_space=", method!(RbPreTokenizer::metaspace_set_add_prefix_space, 1))?;
502
+ class.define_singleton_method("_new", function!(RbMetaspace::new, 3))?;
503
+ class.define_method("prepend_scheme", method!(RbPreTokenizer::metaspace_prepend_scheme, 0))?;
504
+ class.define_method("prepend_scheme=", method!(RbPreTokenizer::metaspace_set_prepend_scheme, 1))?;
471
505
  class.define_method("replacement", method!(RbPreTokenizer::metaspace_replacement, 0))?;
472
506
  class.define_method("replacement=", method!(RbPreTokenizer::metaspace_set_replacement, 1))?;
507
+ class.define_method("split", method!(RbPreTokenizer::metaspace_split, 0))?;
508
+ class.define_method("split=", method!(RbPreTokenizer::metaspace_set_split, 1))?;
473
509
 
474
510
  let class = module.define_class("Punctuation", pre_tokenizer)?;
475
511
  class.define_singleton_method("_new", function!(RbPunctuation::new, 1))?;
@@ -1,8 +1,8 @@
1
1
  module Tokenizers
2
2
  module Decoders
3
3
  class Metaspace
4
- def self.new(replacement: "\u2581", add_prefix_space: true)
5
- _new(replacement, add_prefix_space)
4
+ def self.new(replacement: "\u2581", prepend_scheme: "always", split: true)
5
+ _new(replacement, prepend_scheme, split)
6
6
  end
7
7
  end
8
8
  end
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.15.2"
4
+ TOKENIZERS_VERSION = "0.19.1"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -1,8 +1,8 @@
1
1
  module Tokenizers
2
2
  module PreTokenizers
3
3
  class Metaspace
4
- def self.new(replacement: "\u2581", add_prefix_space: true)
5
- _new(replacement, add_prefix_space)
4
+ def self.new(replacement: "\u2581", prepend_scheme: "always", split: true)
5
+ _new(replacement, prepend_scheme, split)
6
6
  end
7
7
  end
8
8
  end
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.4.4"
2
+ VERSION = "0.5.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.4
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-02-27 00:00:00.000000000 Z
11
+ date: 2024-05-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -93,14 +93,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
93
93
  requirements:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
- version: '3'
96
+ version: '3.1'
97
97
  required_rubygems_version: !ruby/object:Gem::Requirement
98
98
  requirements:
99
99
  - - ">="
100
100
  - !ruby/object:Gem::Version
101
101
  version: '0'
102
102
  requirements: []
103
- rubygems_version: 3.5.3
103
+ rubygems_version: 3.5.9
104
104
  signing_key:
105
105
  specification_version: 4
106
106
  summary: Fast state-of-the-art tokenizers for Ruby