tokenizers 0.4.4 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fe2e8b2ec97ac4dabb4e401cb426079c065be420bf8da41c9484b3d38290589c
4
- data.tar.gz: bba3b9d9d94a278e3f5189fda48b3b3a7f1ff369b66ae8b85945e92a2d8b90ed
3
+ metadata.gz: 258211e71ca06e96bb4ee01b15e29f6f74d3c70d04af246e95b178e10f093059
4
+ data.tar.gz: 6e0b01c577830afdf1c7d677b1377191420d85e0f1f8638893f72cbb7ccef322
5
5
  SHA512:
6
- metadata.gz: fd5876cc95c22b917f1a5715f9b9651f714291f017c00f83316ad2c3cc48c8819d4de70288b4067570c9f729cca9d06cbfa8562996a61ac694e202268a58cdea
7
- data.tar.gz: cb793a73aabafe933ef8d7259b7a36848b1f38cacdf20ef3f6f8554671b7de6ece53b3c14f46f368f540ecbab5dfca087f730e99c22caf84c7b35d539b07b633
6
+ metadata.gz: 4e0ea1f11dbab96b213190397ee8676d6233568f4fe013970a5a2c32105ed20ec06a5c8bc7379065799de315a0fc6d5f47807f9af47bc6f47926e4147c3eabcc
7
+ data.tar.gz: ccd00b103577c6cff4dded6a3bc42394eccb3e24b950674a33eedf76df7c08bc89cda8219f076fce4cf20d90580da82c03e001a4e49ceb80e56ae4055b4617cf
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.5.0 (2024-05-21)
2
+
3
+ - Updated Tokenizers to 0.19.1
4
+ - Replaced `add_prefix_space` with `prepend_scheme` and `split` options for `Metaspace` decoder and pre-tokenizer
5
+ - Dropped support for Ruby < 3.1
6
+
1
7
  ## 0.4.4 (2024-02-27)
2
8
 
3
9
  - Updated Tokenizers to 0.15.2
data/Cargo.lock CHANGED
@@ -40,7 +40,7 @@ dependencies = [
40
40
  "regex",
41
41
  "rustc-hash",
42
42
  "shlex",
43
- "syn 2.0.38",
43
+ "syn",
44
44
  ]
45
45
 
46
46
  [[package]]
@@ -135,9 +135,9 @@ dependencies = [
135
135
 
136
136
  [[package]]
137
137
  name = "darling"
138
- version = "0.14.4"
138
+ version = "0.20.8"
139
139
  source = "registry+https://github.com/rust-lang/crates.io-index"
140
- checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850"
140
+ checksum = "54e36fcd13ed84ffdfda6f5be89b31287cbb80c439841fe69e04841435464391"
141
141
  dependencies = [
142
142
  "darling_core",
143
143
  "darling_macro",
@@ -145,58 +145,58 @@ dependencies = [
145
145
 
146
146
  [[package]]
147
147
  name = "darling_core"
148
- version = "0.14.4"
148
+ version = "0.20.8"
149
149
  source = "registry+https://github.com/rust-lang/crates.io-index"
150
- checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0"
150
+ checksum = "9c2cf1c23a687a1feeb728783b993c4e1ad83d99f351801977dd809b48d0a70f"
151
151
  dependencies = [
152
152
  "fnv",
153
153
  "ident_case",
154
154
  "proc-macro2",
155
155
  "quote",
156
156
  "strsim",
157
- "syn 1.0.109",
157
+ "syn",
158
158
  ]
159
159
 
160
160
  [[package]]
161
161
  name = "darling_macro"
162
- version = "0.14.4"
162
+ version = "0.20.8"
163
163
  source = "registry+https://github.com/rust-lang/crates.io-index"
164
- checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e"
164
+ checksum = "a668eda54683121533a393014d8692171709ff57a7d61f187b6e782719f8933f"
165
165
  dependencies = [
166
166
  "darling_core",
167
167
  "quote",
168
- "syn 1.0.109",
168
+ "syn",
169
169
  ]
170
170
 
171
171
  [[package]]
172
172
  name = "derive_builder"
173
- version = "0.12.0"
173
+ version = "0.20.0"
174
174
  source = "registry+https://github.com/rust-lang/crates.io-index"
175
- checksum = "8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8"
175
+ checksum = "0350b5cb0331628a5916d6c5c0b72e97393b8b6b03b47a9284f4e7f5a405ffd7"
176
176
  dependencies = [
177
177
  "derive_builder_macro",
178
178
  ]
179
179
 
180
180
  [[package]]
181
181
  name = "derive_builder_core"
182
- version = "0.12.0"
182
+ version = "0.20.0"
183
183
  source = "registry+https://github.com/rust-lang/crates.io-index"
184
- checksum = "c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f"
184
+ checksum = "d48cda787f839151732d396ac69e3473923d54312c070ee21e9effcaa8ca0b1d"
185
185
  dependencies = [
186
186
  "darling",
187
187
  "proc-macro2",
188
188
  "quote",
189
- "syn 1.0.109",
189
+ "syn",
190
190
  ]
191
191
 
192
192
  [[package]]
193
193
  name = "derive_builder_macro"
194
- version = "0.12.0"
194
+ version = "0.20.0"
195
195
  source = "registry+https://github.com/rust-lang/crates.io-index"
196
- checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e"
196
+ checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b"
197
197
  dependencies = [
198
198
  "derive_builder_core",
199
- "syn 1.0.109",
199
+ "syn",
200
200
  ]
201
201
 
202
202
  [[package]]
@@ -350,9 +350,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
350
350
 
351
351
  [[package]]
352
352
  name = "magnus"
353
- version = "0.6.2"
353
+ version = "0.6.4"
354
354
  source = "registry+https://github.com/rust-lang/crates.io-index"
355
- checksum = "4778544796676e8428e9c622460ebf284bea52d8b10db3aeb449d8b5e61b3a13"
355
+ checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
356
356
  dependencies = [
357
357
  "magnus-macros",
358
358
  "rb-sys",
@@ -368,7 +368,7 @@ checksum = "5968c820e2960565f647819f5928a42d6e874551cab9d88d75e3e0660d7f71e3"
368
368
  dependencies = [
369
369
  "proc-macro2",
370
370
  "quote",
371
- "syn 2.0.38",
371
+ "syn",
372
372
  ]
373
373
 
374
374
  [[package]]
@@ -394,9 +394,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
394
394
 
395
395
  [[package]]
396
396
  name = "monostate"
397
- version = "0.1.9"
397
+ version = "0.1.12"
398
398
  source = "registry+https://github.com/rust-lang/crates.io-index"
399
- checksum = "15f370ae88093ec6b11a710dec51321a61d420fafd1bad6e30d01bd9c920e8ee"
399
+ checksum = "a20fffcd8ca4c69d31e036a71abc400147b41f90895df4edcb36497a1f8af8bf"
400
400
  dependencies = [
401
401
  "monostate-impl",
402
402
  "serde",
@@ -404,13 +404,13 @@ dependencies = [
404
404
 
405
405
  [[package]]
406
406
  name = "monostate-impl"
407
- version = "0.1.9"
407
+ version = "0.1.12"
408
408
  source = "registry+https://github.com/rust-lang/crates.io-index"
409
- checksum = "371717c0a5543d6a800cac822eac735aa7d2d2fbb41002e9856a4089532dbdce"
409
+ checksum = "bf307cbbbd777a9c10cec88ddafee572b3484caad5cce0c9236523c3803105a6"
410
410
  dependencies = [
411
411
  "proc-macro2",
412
412
  "quote",
413
- "syn 2.0.38",
413
+ "syn",
414
414
  ]
415
415
 
416
416
  [[package]]
@@ -489,18 +489,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
489
489
 
490
490
  [[package]]
491
491
  name = "proc-macro2"
492
- version = "1.0.68"
492
+ version = "1.0.81"
493
493
  source = "registry+https://github.com/rust-lang/crates.io-index"
494
- checksum = "5b1106fec09662ec6dd98ccac0f81cef56984d0b49f75c92d8cbad76e20c005c"
494
+ checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba"
495
495
  dependencies = [
496
496
  "unicode-ident",
497
497
  ]
498
498
 
499
499
  [[package]]
500
500
  name = "quote"
501
- version = "1.0.33"
501
+ version = "1.0.36"
502
502
  source = "registry+https://github.com/rust-lang/crates.io-index"
503
- checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
503
+ checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
504
504
  dependencies = [
505
505
  "proc-macro2",
506
506
  ]
@@ -537,9 +537,9 @@ dependencies = [
537
537
 
538
538
  [[package]]
539
539
  name = "rayon"
540
- version = "1.8.0"
540
+ version = "1.10.0"
541
541
  source = "registry+https://github.com/rust-lang/crates.io-index"
542
- checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1"
542
+ checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
543
543
  dependencies = [
544
544
  "either",
545
545
  "rayon-core",
@@ -558,9 +558,9 @@ dependencies = [
558
558
 
559
559
  [[package]]
560
560
  name = "rayon-core"
561
- version = "1.12.0"
561
+ version = "1.12.1"
562
562
  source = "registry+https://github.com/rust-lang/crates.io-index"
563
- checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed"
563
+ checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
564
564
  dependencies = [
565
565
  "crossbeam-deque",
566
566
  "crossbeam-utils",
@@ -568,18 +568,18 @@ dependencies = [
568
568
 
569
569
  [[package]]
570
570
  name = "rb-sys"
571
- version = "0.9.89"
571
+ version = "0.9.97"
572
572
  source = "registry+https://github.com/rust-lang/crates.io-index"
573
- checksum = "0d197f2c03751ef006f29d593d22aa9068c9c358e04ca503afea0329c366147c"
573
+ checksum = "47d30bcad206b51f2f66121190ca678dce1fdf3a2eae0ac5d838d1818b19bdf5"
574
574
  dependencies = [
575
575
  "rb-sys-build",
576
576
  ]
577
577
 
578
578
  [[package]]
579
579
  name = "rb-sys-build"
580
- version = "0.9.89"
580
+ version = "0.9.97"
581
581
  source = "registry+https://github.com/rust-lang/crates.io-index"
582
- checksum = "2b50caf8fd028f12abe00d6debe2ae2adf6202c9ca3caa59487eda710d90fa28"
582
+ checksum = "3cbd92f281615f3c2dcb9dcb0f0576624752afbf9a7f99173b37c4b55b62dd8a"
583
583
  dependencies = [
584
584
  "bindgen",
585
585
  "lazy_static",
@@ -587,7 +587,7 @@ dependencies = [
587
587
  "quote",
588
588
  "regex",
589
589
  "shell-words",
590
- "syn 2.0.38",
590
+ "syn",
591
591
  ]
592
592
 
593
593
  [[package]]
@@ -598,33 +598,27 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
598
598
 
599
599
  [[package]]
600
600
  name = "regex"
601
- version = "1.9.5"
601
+ version = "1.10.4"
602
602
  source = "registry+https://github.com/rust-lang/crates.io-index"
603
- checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47"
603
+ checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
604
604
  dependencies = [
605
605
  "aho-corasick",
606
606
  "memchr",
607
607
  "regex-automata",
608
- "regex-syntax 0.7.5",
608
+ "regex-syntax",
609
609
  ]
610
610
 
611
611
  [[package]]
612
612
  name = "regex-automata"
613
- version = "0.3.8"
613
+ version = "0.4.6"
614
614
  source = "registry+https://github.com/rust-lang/crates.io-index"
615
- checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
615
+ checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
616
616
  dependencies = [
617
617
  "aho-corasick",
618
618
  "memchr",
619
- "regex-syntax 0.7.5",
619
+ "regex-syntax",
620
620
  ]
621
621
 
622
- [[package]]
623
- name = "regex-syntax"
624
- version = "0.7.5"
625
- source = "registry+https://github.com/rust-lang/crates.io-index"
626
- checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
627
-
628
622
  [[package]]
629
623
  name = "regex-syntax"
630
624
  version = "0.8.2"
@@ -672,7 +666,7 @@ checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2"
672
666
  dependencies = [
673
667
  "proc-macro2",
674
668
  "quote",
675
- "syn 2.0.38",
669
+ "syn",
676
670
  ]
677
671
 
678
672
  [[package]]
@@ -724,20 +718,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
724
718
 
725
719
  [[package]]
726
720
  name = "syn"
727
- version = "1.0.109"
728
- source = "registry+https://github.com/rust-lang/crates.io-index"
729
- checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
730
- dependencies = [
731
- "proc-macro2",
732
- "quote",
733
- "unicode-ident",
734
- ]
735
-
736
- [[package]]
737
- name = "syn"
738
- version = "2.0.38"
721
+ version = "2.0.59"
739
722
  source = "registry+https://github.com/rust-lang/crates.io-index"
740
- checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b"
723
+ checksum = "4a6531ffc7b071655e4ce2e04bd464c4830bb585a61cabb96cf808f05172615a"
741
724
  dependencies = [
742
725
  "proc-macro2",
743
726
  "quote",
@@ -761,24 +744,24 @@ checksum = "10712f02019e9288794769fba95cd6847df9874d49d871d062172f9dd41bc4cc"
761
744
  dependencies = [
762
745
  "proc-macro2",
763
746
  "quote",
764
- "syn 2.0.38",
747
+ "syn",
765
748
  ]
766
749
 
767
750
  [[package]]
768
751
  name = "tokenizers"
769
- version = "0.4.4"
752
+ version = "0.5.0"
770
753
  dependencies = [
771
754
  "magnus",
772
755
  "onig",
773
756
  "serde",
774
- "tokenizers 0.15.2",
757
+ "tokenizers 0.19.1",
775
758
  ]
776
759
 
777
760
  [[package]]
778
761
  name = "tokenizers"
779
- version = "0.15.2"
762
+ version = "0.19.1"
780
763
  source = "registry+https://github.com/rust-lang/crates.io-index"
781
- checksum = "3dd47962b0ba36e7fd33518fbf1754d136fd1474000162bbf2a8b5fcb2d3654d"
764
+ checksum = "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd"
782
765
  dependencies = [
783
766
  "aho-corasick",
784
767
  "derive_builder",
@@ -796,7 +779,7 @@ dependencies = [
796
779
  "rayon",
797
780
  "rayon-cond",
798
781
  "regex",
799
- "regex-syntax 0.8.2",
782
+ "regex-syntax",
800
783
  "serde",
801
784
  "serde_json",
802
785
  "spm_precompiled",
@@ -823,9 +806,9 @@ dependencies = [
823
806
 
824
807
  [[package]]
825
808
  name = "unicode-segmentation"
826
- version = "1.10.1"
809
+ version = "1.11.0"
827
810
  source = "registry+https://github.com/rust-lang/crates.io-index"
828
- checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
811
+ checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
829
812
 
830
813
  [[package]]
831
814
  name = "unicode-width"
@@ -1,10 +1,10 @@
1
1
  [package]
2
2
  name = "tokenizers"
3
- version = "0.4.4"
3
+ version = "0.5.0"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
7
- rust-version = "1.62.0"
7
+ rust-version = "1.63.0"
8
8
  publish = false
9
9
 
10
10
  [lib]
@@ -16,6 +16,6 @@ onig = { version = "6", default-features = false }
16
16
  serde = { version = "1", features = ["rc", "derive"] }
17
17
 
18
18
  [dependencies.tokenizers]
19
- version = "=0.15.2" # also update in from_pretrained.rb
19
+ version = "=0.19.1" # also update in from_pretrained.rb
20
20
  default-features = false
21
21
  features = ["progressbar", "onig", "esaxx_fast"]
@@ -1,5 +1,6 @@
1
1
  use std::sync::{Arc, RwLock};
2
2
 
3
+ use crate::pre_tokenizers::from_string;
3
4
  use magnus::value::Lazy;
4
5
  use magnus::{
5
6
  data_type_builder, function, method, Class, DataType, DataTypeFunctions, Module, Object, RClass, RModule,
@@ -11,7 +12,7 @@ use tk::decoders::byte_fallback::ByteFallback;
11
12
  use tk::decoders::byte_level::ByteLevel;
12
13
  use tk::decoders::ctc::CTC;
13
14
  use tk::decoders::fuse::Fuse;
14
- use tk::decoders::metaspace::Metaspace;
15
+ use tk::decoders::metaspace::{Metaspace, PrependScheme};
15
16
  use tk::decoders::strip::Strip;
16
17
  use tk::decoders::wordpiece::WordPiece;
17
18
  use tk::decoders::DecoderWrapper;
@@ -126,12 +127,29 @@ impl RbDecoder {
126
127
  setter!(self, Metaspace, @set_replacement, replacement);
127
128
  }
128
129
 
129
- pub fn metaspace_add_prefix_space(&self) -> bool {
130
- getter!(self, Metaspace, add_prefix_space)
130
+ pub fn metaspace_split(&self) -> bool {
131
+ getter!(self, Metaspace, get_split())
131
132
  }
132
133
 
133
- pub fn metaspace_set_add_prefix_space(&self, add_prefix_space: bool) {
134
- setter!(self, Metaspace, add_prefix_space, add_prefix_space);
134
+ pub fn metaspace_set_split(&self, split: bool) {
135
+ setter!(self, Metaspace, @set_split, split);
136
+ }
137
+
138
+ pub fn metaspace_prepend_scheme(&self) -> String {
139
+ // Assuming Metaspace has a method to get the prepend_scheme as a string
140
+ let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
141
+ match scheme {
142
+ PrependScheme::First => "first",
143
+ PrependScheme::Never => "never",
144
+ PrependScheme::Always => "always",
145
+ }
146
+ .to_string()
147
+ }
148
+
149
+ pub fn metaspace_set_prepend_scheme(&self, prepend_scheme: String) -> RbResult<()> {
150
+ let scheme = from_string(prepend_scheme)?;
151
+ setter!(self, Metaspace, @set_prepend_scheme, scheme);
152
+ Ok(())
135
153
  }
136
154
 
137
155
  pub fn word_piece_cleanup(&self) -> bool {
@@ -194,8 +212,9 @@ impl RbFuse {
194
212
  pub struct RbMetaspaceDecoder {}
195
213
 
196
214
  impl RbMetaspaceDecoder {
197
- pub fn new(replacement: char, add_prefix_space: bool) -> RbDecoder {
198
- Metaspace::new(replacement, add_prefix_space).into()
215
+ pub fn new(replacement: char, prepend_scheme: String, split: bool) -> RbResult<RbDecoder> {
216
+ let prepend_scheme = from_string(prepend_scheme)?;
217
+ Ok(Metaspace::new(replacement, prepend_scheme, split).into())
199
218
  }
200
219
  }
201
220
 
@@ -364,11 +383,13 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
364
383
  class.define_singleton_method("new", function!(RbFuse::new, 0))?;
365
384
 
366
385
  let class = module.define_class("Metaspace", decoder)?;
367
- class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 2))?;
368
- class.define_method("add_prefix_space", method!(RbDecoder::metaspace_add_prefix_space, 0))?;
369
- class.define_method("add_prefix_space=", method!(RbDecoder::metaspace_set_add_prefix_space, 1))?;
386
+ class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 3))?;
387
+ class.define_method("prepend_scheme", method!(RbDecoder::metaspace_prepend_scheme, 0))?;
388
+ class.define_method("prepend_scheme=", method!(RbDecoder::metaspace_set_prepend_scheme, 1))?;
370
389
  class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
371
390
  class.define_method("replacement=", method!(RbDecoder::metaspace_set_replacement, 1))?;
391
+ class.define_method("split", method!(RbDecoder::metaspace_split, 0))?;
392
+ class.define_method("split=", method!(RbDecoder::metaspace_set_split, 1))?;
372
393
 
373
394
  let class = module.define_class("Replace", decoder)?;
374
395
  class.define_singleton_method("new", function!(RbReplaceDecoder::new, 2))?;
@@ -1,7 +1,7 @@
1
1
  use std::sync::{Arc, RwLock};
2
2
 
3
3
  use magnus::{
4
- data_type_builder, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Module, Object,
4
+ data_type_builder, exception, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Error, Module, Object,
5
5
  RArray, RClass, RModule, Ruby, TryConvert, TypedData,
6
6
  };
7
7
 
@@ -12,7 +12,7 @@ use tk::pre_tokenizers::bert::BertPreTokenizer;
12
12
  use tk::pre_tokenizers::byte_level::ByteLevel;
13
13
  use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
14
14
  use tk::pre_tokenizers::digits::Digits;
15
- use tk::pre_tokenizers::metaspace::Metaspace;
15
+ use tk::pre_tokenizers::metaspace::{Metaspace, PrependScheme};
16
16
  use tk::pre_tokenizers::punctuation::Punctuation;
17
17
  use tk::pre_tokenizers::split::Split;
18
18
  use tk::pre_tokenizers::unicode_scripts::UnicodeScripts;
@@ -118,14 +118,6 @@ impl RbPreTokenizer {
118
118
  setter!(self, Digits, individual_digits, individual_digits);
119
119
  }
120
120
 
121
- fn metaspace_add_prefix_space(&self) -> bool {
122
- getter!(self, Metaspace, add_prefix_space)
123
- }
124
-
125
- fn metaspace_set_add_prefix_space(&self, add_prefix_space: bool) {
126
- setter!(self, Metaspace, add_prefix_space, add_prefix_space);
127
- }
128
-
129
121
  fn metaspace_replacement(&self) -> String {
130
122
  getter!(self, Metaspace, get_replacement().to_string())
131
123
  }
@@ -133,6 +125,31 @@ impl RbPreTokenizer {
133
125
  fn metaspace_set_replacement(&self, replacement: char) {
134
126
  setter!(self, Metaspace, @set_replacement, replacement);
135
127
  }
128
+
129
+ fn metaspace_split(&self) -> bool {
130
+ getter!(self, Metaspace, get_split())
131
+ }
132
+
133
+ fn metaspace_set_split(&self, split: bool) {
134
+ setter!(self, Metaspace, @set_split, split);
135
+ }
136
+
137
+ fn metaspace_prepend_scheme(&self) -> String {
138
+ // Assuming Metaspace has a method to get the prepend_scheme as a string
139
+ let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
140
+ match scheme {
141
+ PrependScheme::First => "first",
142
+ PrependScheme::Never => "never",
143
+ PrependScheme::Always => "always",
144
+ }
145
+ .to_string()
146
+ }
147
+
148
+ fn metaspace_set_prepend_scheme(&self, prepend_scheme: String) -> RbResult<()> {
149
+ let scheme = from_string(prepend_scheme)?;
150
+ setter!(self, Metaspace, @set_prepend_scheme, scheme);
151
+ Ok(())
152
+ }
136
153
  }
137
154
 
138
155
  impl PreTokenizer for RbPreTokenizer {
@@ -180,9 +197,11 @@ pub struct RbMetaspace {}
180
197
  impl RbMetaspace {
181
198
  fn new(
182
199
  replacement: char,
183
- add_prefix_space: bool,
184
- ) -> RbPreTokenizer {
185
- Metaspace::new(replacement, add_prefix_space).into()
200
+ prepend_scheme: String,
201
+ split: bool,
202
+ ) -> RbResult<RbPreTokenizer> {
203
+ let prepend_scheme = from_string(prepend_scheme)?;
204
+ Ok(Metaspace::new(replacement, prepend_scheme, split).into())
186
205
  }
187
206
  }
188
207
 
@@ -252,6 +271,21 @@ impl RbSequence {
252
271
  }
253
272
  }
254
273
 
274
+ pub(crate) fn from_string(string: String) -> RbResult<PrependScheme> {
275
+ let scheme = match string.as_str() {
276
+ "first" => PrependScheme::First,
277
+ "never" => PrependScheme::Never,
278
+ "always" => PrependScheme::Always,
279
+ _ => {
280
+ return Err(Error::new(exception::arg_error(), format!(
281
+ "{} is an unknown variant, should be one of ['first', 'never', 'always']",
282
+ string
283
+ )));
284
+ }
285
+ };
286
+ Ok(scheme)
287
+ }
288
+
255
289
  #[derive(Clone, Deserialize)]
256
290
  #[serde(untagged)]
257
291
  pub(crate) enum RbPreTokenizerWrapper {
@@ -465,11 +499,13 @@ pub fn init_pre_tokenizers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
465
499
  class.define_method("individual_digits=", method!(RbPreTokenizer::digits_set_individual_digits, 1))?;
466
500
 
467
501
  let class = module.define_class("Metaspace", pre_tokenizer)?;
468
- class.define_singleton_method("_new", function!(RbMetaspace::new, 2))?;
469
- class.define_method("add_prefix_space", method!(RbPreTokenizer::metaspace_add_prefix_space, 0))?;
470
- class.define_method("add_prefix_space=", method!(RbPreTokenizer::metaspace_set_add_prefix_space, 1))?;
502
+ class.define_singleton_method("_new", function!(RbMetaspace::new, 3))?;
503
+ class.define_method("prepend_scheme", method!(RbPreTokenizer::metaspace_prepend_scheme, 0))?;
504
+ class.define_method("prepend_scheme=", method!(RbPreTokenizer::metaspace_set_prepend_scheme, 1))?;
471
505
  class.define_method("replacement", method!(RbPreTokenizer::metaspace_replacement, 0))?;
472
506
  class.define_method("replacement=", method!(RbPreTokenizer::metaspace_set_replacement, 1))?;
507
+ class.define_method("split", method!(RbPreTokenizer::metaspace_split, 0))?;
508
+ class.define_method("split=", method!(RbPreTokenizer::metaspace_set_split, 1))?;
473
509
 
474
510
  let class = module.define_class("Punctuation", pre_tokenizer)?;
475
511
  class.define_singleton_method("_new", function!(RbPunctuation::new, 1))?;
@@ -1,8 +1,8 @@
1
1
  module Tokenizers
2
2
  module Decoders
3
3
  class Metaspace
4
- def self.new(replacement: "\u2581", add_prefix_space: true)
5
- _new(replacement, add_prefix_space)
4
+ def self.new(replacement: "\u2581", prepend_scheme: "always", split: true)
5
+ _new(replacement, prepend_scheme, split)
6
6
  end
7
7
  end
8
8
  end
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.15.2"
4
+ TOKENIZERS_VERSION = "0.19.1"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -1,8 +1,8 @@
1
1
  module Tokenizers
2
2
  module PreTokenizers
3
3
  class Metaspace
4
- def self.new(replacement: "\u2581", add_prefix_space: true)
5
- _new(replacement, add_prefix_space)
4
+ def self.new(replacement: "\u2581", prepend_scheme: "always", split: true)
5
+ _new(replacement, prepend_scheme, split)
6
6
  end
7
7
  end
8
8
  end
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.4.4"
2
+ VERSION = "0.5.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.4
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-02-27 00:00:00.000000000 Z
11
+ date: 2024-05-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -93,14 +93,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
93
93
  requirements:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
- version: '3'
96
+ version: '3.1'
97
97
  required_rubygems_version: !ruby/object:Gem::Requirement
98
98
  requirements:
99
99
  - - ">="
100
100
  - !ruby/object:Gem::Version
101
101
  version: '0'
102
102
  requirements: []
103
- rubygems_version: 3.5.3
103
+ rubygems_version: 3.5.9
104
104
  signing_key:
105
105
  specification_version: 4
106
106
  summary: Fast state-of-the-art tokenizers for Ruby