tokenizers 0.4.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 31ba3313f98f5360a6e9b0434c674d113622cf0421c6e7610ba250a8a9c79402
4
- data.tar.gz: ba7c1913bdafa2b58835ac3689a34c37206b1a567c634f81aedaea0ca21a20cf
3
+ metadata.gz: 258211e71ca06e96bb4ee01b15e29f6f74d3c70d04af246e95b178e10f093059
4
+ data.tar.gz: 6e0b01c577830afdf1c7d677b1377191420d85e0f1f8638893f72cbb7ccef322
5
5
  SHA512:
6
- metadata.gz: 7cac28260eea675e5cea80324fa755681e9a8a06ce38fc501c57da69056d36d0485576dac1237da21a7d23ca24dacc46b6b1a92d4c7b5f91644fda37b5550ada
7
- data.tar.gz: 84c361eadb625a96234b454f91b7f9a847010e42927042978003e3219e1b28d1da8d7665bc9cda7aa820b6053e5c6be1458cdaee6fe1d4709df632fb744155b4
6
+ metadata.gz: 4e0ea1f11dbab96b213190397ee8676d6233568f4fe013970a5a2c32105ed20ec06a5c8bc7379065799de315a0fc6d5f47807f9af47bc6f47926e4147c3eabcc
7
+ data.tar.gz: ccd00b103577c6cff4dded6a3bc42394eccb3e24b950674a33eedf76df7c08bc89cda8219f076fce4cf20d90580da82c03e001a4e49ceb80e56ae4055b4617cf
data/CHANGELOG.md CHANGED
@@ -1,3 +1,13 @@
1
+ ## 0.5.0 (2024-05-21)
2
+
3
+ - Updated Tokenizers to 0.19.1
4
+ - Replaced `add_prefix_space` with `prepend_scheme` and `split` options for `Metaspace` decoder and pre-tokenizer
5
+ - Dropped support for Ruby < 3.1
6
+
7
+ ## 0.4.4 (2024-02-27)
8
+
9
+ - Updated Tokenizers to 0.15.2
10
+
1
11
  ## 0.4.3 (2024-01-03)
2
12
 
3
13
  - Added support for Ruby 3.3
data/Cargo.lock CHANGED
@@ -40,7 +40,7 @@ dependencies = [
40
40
  "regex",
41
41
  "rustc-hash",
42
42
  "shlex",
43
- "syn 2.0.38",
43
+ "syn",
44
44
  ]
45
45
 
46
46
  [[package]]
@@ -135,9 +135,9 @@ dependencies = [
135
135
 
136
136
  [[package]]
137
137
  name = "darling"
138
- version = "0.14.4"
138
+ version = "0.20.8"
139
139
  source = "registry+https://github.com/rust-lang/crates.io-index"
140
- checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850"
140
+ checksum = "54e36fcd13ed84ffdfda6f5be89b31287cbb80c439841fe69e04841435464391"
141
141
  dependencies = [
142
142
  "darling_core",
143
143
  "darling_macro",
@@ -145,58 +145,58 @@ dependencies = [
145
145
 
146
146
  [[package]]
147
147
  name = "darling_core"
148
- version = "0.14.4"
148
+ version = "0.20.8"
149
149
  source = "registry+https://github.com/rust-lang/crates.io-index"
150
- checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0"
150
+ checksum = "9c2cf1c23a687a1feeb728783b993c4e1ad83d99f351801977dd809b48d0a70f"
151
151
  dependencies = [
152
152
  "fnv",
153
153
  "ident_case",
154
154
  "proc-macro2",
155
155
  "quote",
156
156
  "strsim",
157
- "syn 1.0.109",
157
+ "syn",
158
158
  ]
159
159
 
160
160
  [[package]]
161
161
  name = "darling_macro"
162
- version = "0.14.4"
162
+ version = "0.20.8"
163
163
  source = "registry+https://github.com/rust-lang/crates.io-index"
164
- checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e"
164
+ checksum = "a668eda54683121533a393014d8692171709ff57a7d61f187b6e782719f8933f"
165
165
  dependencies = [
166
166
  "darling_core",
167
167
  "quote",
168
- "syn 1.0.109",
168
+ "syn",
169
169
  ]
170
170
 
171
171
  [[package]]
172
172
  name = "derive_builder"
173
- version = "0.12.0"
173
+ version = "0.20.0"
174
174
  source = "registry+https://github.com/rust-lang/crates.io-index"
175
- checksum = "8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8"
175
+ checksum = "0350b5cb0331628a5916d6c5c0b72e97393b8b6b03b47a9284f4e7f5a405ffd7"
176
176
  dependencies = [
177
177
  "derive_builder_macro",
178
178
  ]
179
179
 
180
180
  [[package]]
181
181
  name = "derive_builder_core"
182
- version = "0.12.0"
182
+ version = "0.20.0"
183
183
  source = "registry+https://github.com/rust-lang/crates.io-index"
184
- checksum = "c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f"
184
+ checksum = "d48cda787f839151732d396ac69e3473923d54312c070ee21e9effcaa8ca0b1d"
185
185
  dependencies = [
186
186
  "darling",
187
187
  "proc-macro2",
188
188
  "quote",
189
- "syn 1.0.109",
189
+ "syn",
190
190
  ]
191
191
 
192
192
  [[package]]
193
193
  name = "derive_builder_macro"
194
- version = "0.12.0"
194
+ version = "0.20.0"
195
195
  source = "registry+https://github.com/rust-lang/crates.io-index"
196
- checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e"
196
+ checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b"
197
197
  dependencies = [
198
198
  "derive_builder_core",
199
- "syn 1.0.109",
199
+ "syn",
200
200
  ]
201
201
 
202
202
  [[package]]
@@ -280,6 +280,15 @@ dependencies = [
280
280
  "either",
281
281
  ]
282
282
 
283
+ [[package]]
284
+ name = "itertools"
285
+ version = "0.12.1"
286
+ source = "registry+https://github.com/rust-lang/crates.io-index"
287
+ checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
288
+ dependencies = [
289
+ "either",
290
+ ]
291
+
283
292
  [[package]]
284
293
  name = "itoa"
285
294
  version = "1.0.6"
@@ -341,9 +350,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
341
350
 
342
351
  [[package]]
343
352
  name = "magnus"
344
- version = "0.6.2"
353
+ version = "0.6.4"
345
354
  source = "registry+https://github.com/rust-lang/crates.io-index"
346
- checksum = "4778544796676e8428e9c622460ebf284bea52d8b10db3aeb449d8b5e61b3a13"
355
+ checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
347
356
  dependencies = [
348
357
  "magnus-macros",
349
358
  "rb-sys",
@@ -359,7 +368,7 @@ checksum = "5968c820e2960565f647819f5928a42d6e874551cab9d88d75e3e0660d7f71e3"
359
368
  dependencies = [
360
369
  "proc-macro2",
361
370
  "quote",
362
- "syn 2.0.38",
371
+ "syn",
363
372
  ]
364
373
 
365
374
  [[package]]
@@ -385,9 +394,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
385
394
 
386
395
  [[package]]
387
396
  name = "monostate"
388
- version = "0.1.9"
397
+ version = "0.1.12"
389
398
  source = "registry+https://github.com/rust-lang/crates.io-index"
390
- checksum = "15f370ae88093ec6b11a710dec51321a61d420fafd1bad6e30d01bd9c920e8ee"
399
+ checksum = "a20fffcd8ca4c69d31e036a71abc400147b41f90895df4edcb36497a1f8af8bf"
391
400
  dependencies = [
392
401
  "monostate-impl",
393
402
  "serde",
@@ -395,13 +404,13 @@ dependencies = [
395
404
 
396
405
  [[package]]
397
406
  name = "monostate-impl"
398
- version = "0.1.9"
407
+ version = "0.1.12"
399
408
  source = "registry+https://github.com/rust-lang/crates.io-index"
400
- checksum = "371717c0a5543d6a800cac822eac735aa7d2d2fbb41002e9856a4089532dbdce"
409
+ checksum = "bf307cbbbd777a9c10cec88ddafee572b3484caad5cce0c9236523c3803105a6"
401
410
  dependencies = [
402
411
  "proc-macro2",
403
412
  "quote",
404
- "syn 2.0.38",
413
+ "syn",
405
414
  ]
406
415
 
407
416
  [[package]]
@@ -480,18 +489,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
480
489
 
481
490
  [[package]]
482
491
  name = "proc-macro2"
483
- version = "1.0.68"
492
+ version = "1.0.81"
484
493
  source = "registry+https://github.com/rust-lang/crates.io-index"
485
- checksum = "5b1106fec09662ec6dd98ccac0f81cef56984d0b49f75c92d8cbad76e20c005c"
494
+ checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba"
486
495
  dependencies = [
487
496
  "unicode-ident",
488
497
  ]
489
498
 
490
499
  [[package]]
491
500
  name = "quote"
492
- version = "1.0.33"
501
+ version = "1.0.36"
493
502
  source = "registry+https://github.com/rust-lang/crates.io-index"
494
- checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
503
+ checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
495
504
  dependencies = [
496
505
  "proc-macro2",
497
506
  ]
@@ -528,9 +537,9 @@ dependencies = [
528
537
 
529
538
  [[package]]
530
539
  name = "rayon"
531
- version = "1.8.0"
540
+ version = "1.10.0"
532
541
  source = "registry+https://github.com/rust-lang/crates.io-index"
533
- checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1"
542
+ checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
534
543
  dependencies = [
535
544
  "either",
536
545
  "rayon-core",
@@ -543,15 +552,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
543
552
  checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9"
544
553
  dependencies = [
545
554
  "either",
546
- "itertools",
555
+ "itertools 0.11.0",
547
556
  "rayon",
548
557
  ]
549
558
 
550
559
  [[package]]
551
560
  name = "rayon-core"
552
- version = "1.12.0"
561
+ version = "1.12.1"
553
562
  source = "registry+https://github.com/rust-lang/crates.io-index"
554
- checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed"
563
+ checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
555
564
  dependencies = [
556
565
  "crossbeam-deque",
557
566
  "crossbeam-utils",
@@ -559,18 +568,18 @@ dependencies = [
559
568
 
560
569
  [[package]]
561
570
  name = "rb-sys"
562
- version = "0.9.86"
571
+ version = "0.9.97"
563
572
  source = "registry+https://github.com/rust-lang/crates.io-index"
564
- checksum = "7285f2a7b92f58ab198e3fd59a71d2861478f9c4642f41e83582385818941697"
573
+ checksum = "47d30bcad206b51f2f66121190ca678dce1fdf3a2eae0ac5d838d1818b19bdf5"
565
574
  dependencies = [
566
575
  "rb-sys-build",
567
576
  ]
568
577
 
569
578
  [[package]]
570
579
  name = "rb-sys-build"
571
- version = "0.9.86"
580
+ version = "0.9.97"
572
581
  source = "registry+https://github.com/rust-lang/crates.io-index"
573
- checksum = "71583945f94dabb6c0dfa63f1b71e929c1901e1e288ef3739ab8bed3b7069550"
582
+ checksum = "3cbd92f281615f3c2dcb9dcb0f0576624752afbf9a7f99173b37c4b55b62dd8a"
574
583
  dependencies = [
575
584
  "bindgen",
576
585
  "lazy_static",
@@ -578,7 +587,7 @@ dependencies = [
578
587
  "quote",
579
588
  "regex",
580
589
  "shell-words",
581
- "syn 2.0.38",
590
+ "syn",
582
591
  ]
583
592
 
584
593
  [[package]]
@@ -589,9 +598,9 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
589
598
 
590
599
  [[package]]
591
600
  name = "regex"
592
- version = "1.9.5"
601
+ version = "1.10.4"
593
602
  source = "registry+https://github.com/rust-lang/crates.io-index"
594
- checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47"
603
+ checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
595
604
  dependencies = [
596
605
  "aho-corasick",
597
606
  "memchr",
@@ -601,9 +610,9 @@ dependencies = [
601
610
 
602
611
  [[package]]
603
612
  name = "regex-automata"
604
- version = "0.3.8"
613
+ version = "0.4.6"
605
614
  source = "registry+https://github.com/rust-lang/crates.io-index"
606
- checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
615
+ checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
607
616
  dependencies = [
608
617
  "aho-corasick",
609
618
  "memchr",
@@ -612,9 +621,9 @@ dependencies = [
612
621
 
613
622
  [[package]]
614
623
  name = "regex-syntax"
615
- version = "0.7.5"
624
+ version = "0.8.2"
616
625
  source = "registry+https://github.com/rust-lang/crates.io-index"
617
- checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
626
+ checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
618
627
 
619
628
  [[package]]
620
629
  name = "rustc-hash"
@@ -657,7 +666,7 @@ checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2"
657
666
  dependencies = [
658
667
  "proc-macro2",
659
668
  "quote",
660
- "syn 2.0.38",
669
+ "syn",
661
670
  ]
662
671
 
663
672
  [[package]]
@@ -709,20 +718,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
709
718
 
710
719
  [[package]]
711
720
  name = "syn"
712
- version = "1.0.109"
713
- source = "registry+https://github.com/rust-lang/crates.io-index"
714
- checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
715
- dependencies = [
716
- "proc-macro2",
717
- "quote",
718
- "unicode-ident",
719
- ]
720
-
721
- [[package]]
722
- name = "syn"
723
- version = "2.0.38"
721
+ version = "2.0.59"
724
722
  source = "registry+https://github.com/rust-lang/crates.io-index"
725
- checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b"
723
+ checksum = "4a6531ffc7b071655e4ce2e04bd464c4830bb585a61cabb96cf808f05172615a"
726
724
  dependencies = [
727
725
  "proc-macro2",
728
726
  "quote",
@@ -746,31 +744,31 @@ checksum = "10712f02019e9288794769fba95cd6847df9874d49d871d062172f9dd41bc4cc"
746
744
  dependencies = [
747
745
  "proc-macro2",
748
746
  "quote",
749
- "syn 2.0.38",
747
+ "syn",
750
748
  ]
751
749
 
752
750
  [[package]]
753
751
  name = "tokenizers"
754
- version = "0.4.3"
752
+ version = "0.5.0"
755
753
  dependencies = [
756
754
  "magnus",
757
755
  "onig",
758
756
  "serde",
759
- "tokenizers 0.15.0",
757
+ "tokenizers 0.19.1",
760
758
  ]
761
759
 
762
760
  [[package]]
763
761
  name = "tokenizers"
764
- version = "0.15.0"
762
+ version = "0.19.1"
765
763
  source = "registry+https://github.com/rust-lang/crates.io-index"
766
- checksum = "062b8a9613d6017633b80fb55fbb33f1aff006c36225a3025630753398034b3c"
764
+ checksum = "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd"
767
765
  dependencies = [
768
766
  "aho-corasick",
769
767
  "derive_builder",
770
768
  "esaxx-rs",
771
769
  "getrandom",
772
770
  "indicatif",
773
- "itertools",
771
+ "itertools 0.12.1",
774
772
  "lazy_static",
775
773
  "log",
776
774
  "macro_rules_attribute",
@@ -808,9 +806,9 @@ dependencies = [
808
806
 
809
807
  [[package]]
810
808
  name = "unicode-segmentation"
811
- version = "1.10.1"
809
+ version = "1.11.0"
812
810
  source = "registry+https://github.com/rust-lang/crates.io-index"
813
- checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
811
+ checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
814
812
 
815
813
  [[package]]
816
814
  name = "unicode-width"
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  :slightly_smiling_face: Fast state-of-the-art [tokenizers](https://github.com/huggingface/tokenizers) for Ruby
4
4
 
5
- [![Build Status](https://github.com/ankane/tokenizers-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/tokenizers-ruby/actions)
5
+ [![Build Status](https://github.com/ankane/tokenizers-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/tokenizers-ruby/actions)
6
6
 
7
7
  ## Installation
8
8
 
@@ -1,10 +1,10 @@
1
1
  [package]
2
2
  name = "tokenizers"
3
- version = "0.4.3"
3
+ version = "0.5.0"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
7
- rust-version = "1.62.0"
7
+ rust-version = "1.63.0"
8
8
  publish = false
9
9
 
10
10
  [lib]
@@ -16,6 +16,6 @@ onig = { version = "6", default-features = false }
16
16
  serde = { version = "1", features = ["rc", "derive"] }
17
17
 
18
18
  [dependencies.tokenizers]
19
- version = "=0.15.0" # also update in from_pretrained.rb
19
+ version = "=0.19.1" # also update in from_pretrained.rb
20
20
  default-features = false
21
21
  features = ["progressbar", "onig", "esaxx_fast"]
@@ -1,5 +1,6 @@
1
1
  use std::sync::{Arc, RwLock};
2
2
 
3
+ use crate::pre_tokenizers::from_string;
3
4
  use magnus::value::Lazy;
4
5
  use magnus::{
5
6
  data_type_builder, function, method, Class, DataType, DataTypeFunctions, Module, Object, RClass, RModule,
@@ -11,7 +12,7 @@ use tk::decoders::byte_fallback::ByteFallback;
11
12
  use tk::decoders::byte_level::ByteLevel;
12
13
  use tk::decoders::ctc::CTC;
13
14
  use tk::decoders::fuse::Fuse;
14
- use tk::decoders::metaspace::Metaspace;
15
+ use tk::decoders::metaspace::{Metaspace, PrependScheme};
15
16
  use tk::decoders::strip::Strip;
16
17
  use tk::decoders::wordpiece::WordPiece;
17
18
  use tk::decoders::DecoderWrapper;
@@ -126,12 +127,29 @@ impl RbDecoder {
126
127
  setter!(self, Metaspace, @set_replacement, replacement);
127
128
  }
128
129
 
129
- pub fn metaspace_add_prefix_space(&self) -> bool {
130
- getter!(self, Metaspace, add_prefix_space)
130
+ pub fn metaspace_split(&self) -> bool {
131
+ getter!(self, Metaspace, get_split())
131
132
  }
132
133
 
133
- pub fn metaspace_set_add_prefix_space(&self, add_prefix_space: bool) {
134
- setter!(self, Metaspace, add_prefix_space, add_prefix_space);
134
+ pub fn metaspace_set_split(&self, split: bool) {
135
+ setter!(self, Metaspace, @set_split, split);
136
+ }
137
+
138
+ pub fn metaspace_prepend_scheme(&self) -> String {
139
+ // Read the Metaspace's PrependScheme and convert it to its lowercase string name
140
+ let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
141
+ match scheme {
142
+ PrependScheme::First => "first",
143
+ PrependScheme::Never => "never",
144
+ PrependScheme::Always => "always",
145
+ }
146
+ .to_string()
147
+ }
148
+
149
+ pub fn metaspace_set_prepend_scheme(&self, prepend_scheme: String) -> RbResult<()> {
150
+ let scheme = from_string(prepend_scheme)?;
151
+ setter!(self, Metaspace, @set_prepend_scheme, scheme);
152
+ Ok(())
135
153
  }
136
154
 
137
155
  pub fn word_piece_cleanup(&self) -> bool {
@@ -194,8 +212,9 @@ impl RbFuse {
194
212
  pub struct RbMetaspaceDecoder {}
195
213
 
196
214
  impl RbMetaspaceDecoder {
197
- pub fn new(replacement: char, add_prefix_space: bool) -> RbDecoder {
198
- Metaspace::new(replacement, add_prefix_space).into()
215
+ pub fn new(replacement: char, prepend_scheme: String, split: bool) -> RbResult<RbDecoder> {
216
+ let prepend_scheme = from_string(prepend_scheme)?;
217
+ Ok(Metaspace::new(replacement, prepend_scheme, split).into())
199
218
  }
200
219
  }
201
220
 
@@ -364,11 +383,13 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
364
383
  class.define_singleton_method("new", function!(RbFuse::new, 0))?;
365
384
 
366
385
  let class = module.define_class("Metaspace", decoder)?;
367
- class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 2))?;
368
- class.define_method("add_prefix_space", method!(RbDecoder::metaspace_add_prefix_space, 0))?;
369
- class.define_method("add_prefix_space=", method!(RbDecoder::metaspace_set_add_prefix_space, 1))?;
386
+ class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 3))?;
387
+ class.define_method("prepend_scheme", method!(RbDecoder::metaspace_prepend_scheme, 0))?;
388
+ class.define_method("prepend_scheme=", method!(RbDecoder::metaspace_set_prepend_scheme, 1))?;
370
389
  class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
371
390
  class.define_method("replacement=", method!(RbDecoder::metaspace_set_replacement, 1))?;
391
+ class.define_method("split", method!(RbDecoder::metaspace_split, 0))?;
392
+ class.define_method("split=", method!(RbDecoder::metaspace_set_split, 1))?;
372
393
 
373
394
  let class = module.define_class("Replace", decoder)?;
374
395
  class.define_singleton_method("new", function!(RbReplaceDecoder::new, 2))?;
@@ -1,7 +1,7 @@
1
1
  use std::sync::{Arc, RwLock};
2
2
 
3
3
  use magnus::{
4
- data_type_builder, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Module, Object,
4
+ data_type_builder, exception, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Error, Module, Object,
5
5
  RArray, RClass, RModule, Ruby, TryConvert, TypedData,
6
6
  };
7
7
 
@@ -12,7 +12,7 @@ use tk::pre_tokenizers::bert::BertPreTokenizer;
12
12
  use tk::pre_tokenizers::byte_level::ByteLevel;
13
13
  use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
14
14
  use tk::pre_tokenizers::digits::Digits;
15
- use tk::pre_tokenizers::metaspace::Metaspace;
15
+ use tk::pre_tokenizers::metaspace::{Metaspace, PrependScheme};
16
16
  use tk::pre_tokenizers::punctuation::Punctuation;
17
17
  use tk::pre_tokenizers::split::Split;
18
18
  use tk::pre_tokenizers::unicode_scripts::UnicodeScripts;
@@ -118,14 +118,6 @@ impl RbPreTokenizer {
118
118
  setter!(self, Digits, individual_digits, individual_digits);
119
119
  }
120
120
 
121
- fn metaspace_add_prefix_space(&self) -> bool {
122
- getter!(self, Metaspace, add_prefix_space)
123
- }
124
-
125
- fn metaspace_set_add_prefix_space(&self, add_prefix_space: bool) {
126
- setter!(self, Metaspace, add_prefix_space, add_prefix_space);
127
- }
128
-
129
121
  fn metaspace_replacement(&self) -> String {
130
122
  getter!(self, Metaspace, get_replacement().to_string())
131
123
  }
@@ -133,6 +125,31 @@ impl RbPreTokenizer {
133
125
  fn metaspace_set_replacement(&self, replacement: char) {
134
126
  setter!(self, Metaspace, @set_replacement, replacement);
135
127
  }
128
+
129
+ fn metaspace_split(&self) -> bool {
130
+ getter!(self, Metaspace, get_split())
131
+ }
132
+
133
+ fn metaspace_set_split(&self, split: bool) {
134
+ setter!(self, Metaspace, @set_split, split);
135
+ }
136
+
137
+ fn metaspace_prepend_scheme(&self) -> String {
138
+ // Read the Metaspace's PrependScheme and convert it to its lowercase string name
139
+ let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
140
+ match scheme {
141
+ PrependScheme::First => "first",
142
+ PrependScheme::Never => "never",
143
+ PrependScheme::Always => "always",
144
+ }
145
+ .to_string()
146
+ }
147
+
148
+ fn metaspace_set_prepend_scheme(&self, prepend_scheme: String) -> RbResult<()> {
149
+ let scheme = from_string(prepend_scheme)?;
150
+ setter!(self, Metaspace, @set_prepend_scheme, scheme);
151
+ Ok(())
152
+ }
136
153
  }
137
154
 
138
155
  impl PreTokenizer for RbPreTokenizer {
@@ -180,9 +197,11 @@ pub struct RbMetaspace {}
180
197
  impl RbMetaspace {
181
198
  fn new(
182
199
  replacement: char,
183
- add_prefix_space: bool,
184
- ) -> RbPreTokenizer {
185
- Metaspace::new(replacement, add_prefix_space).into()
200
+ prepend_scheme: String,
201
+ split: bool,
202
+ ) -> RbResult<RbPreTokenizer> {
203
+ let prepend_scheme = from_string(prepend_scheme)?;
204
+ Ok(Metaspace::new(replacement, prepend_scheme, split).into())
186
205
  }
187
206
  }
188
207
 
@@ -252,6 +271,21 @@ impl RbSequence {
252
271
  }
253
272
  }
254
273
 
274
+ pub(crate) fn from_string(string: String) -> RbResult<PrependScheme> {
275
+ let scheme = match string.as_str() {
276
+ "first" => PrependScheme::First,
277
+ "never" => PrependScheme::Never,
278
+ "always" => PrependScheme::Always,
279
+ _ => {
280
+ return Err(Error::new(exception::arg_error(), format!(
281
+ "{} is an unknown variant, should be one of ['first', 'never', 'always']",
282
+ string
283
+ )));
284
+ }
285
+ };
286
+ Ok(scheme)
287
+ }
288
+
255
289
  #[derive(Clone, Deserialize)]
256
290
  #[serde(untagged)]
257
291
  pub(crate) enum RbPreTokenizerWrapper {
@@ -465,11 +499,13 @@ pub fn init_pre_tokenizers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
465
499
  class.define_method("individual_digits=", method!(RbPreTokenizer::digits_set_individual_digits, 1))?;
466
500
 
467
501
  let class = module.define_class("Metaspace", pre_tokenizer)?;
468
- class.define_singleton_method("_new", function!(RbMetaspace::new, 2))?;
469
- class.define_method("add_prefix_space", method!(RbPreTokenizer::metaspace_add_prefix_space, 0))?;
470
- class.define_method("add_prefix_space=", method!(RbPreTokenizer::metaspace_set_add_prefix_space, 1))?;
502
+ class.define_singleton_method("_new", function!(RbMetaspace::new, 3))?;
503
+ class.define_method("prepend_scheme", method!(RbPreTokenizer::metaspace_prepend_scheme, 0))?;
504
+ class.define_method("prepend_scheme=", method!(RbPreTokenizer::metaspace_set_prepend_scheme, 1))?;
471
505
  class.define_method("replacement", method!(RbPreTokenizer::metaspace_replacement, 0))?;
472
506
  class.define_method("replacement=", method!(RbPreTokenizer::metaspace_set_replacement, 1))?;
507
+ class.define_method("split", method!(RbPreTokenizer::metaspace_split, 0))?;
508
+ class.define_method("split=", method!(RbPreTokenizer::metaspace_set_split, 1))?;
473
509
 
474
510
  let class = module.define_class("Punctuation", pre_tokenizer)?;
475
511
  class.define_singleton_method("_new", function!(RbPunctuation::new, 1))?;
@@ -77,11 +77,11 @@ impl RbTrainer {
77
77
  setter!(self, BpeTrainer, vocab_size, vocab_size);
78
78
  }
79
79
 
80
- fn bpe_trainer_min_frequency(&self) -> u32 {
80
+ fn bpe_trainer_min_frequency(&self) -> u64 {
81
81
  getter!(self, BpeTrainer, min_frequency)
82
82
  }
83
83
 
84
- fn bpe_trainer_set_min_frequency(&self, freq: u32) {
84
+ fn bpe_trainer_set_min_frequency(&self, freq: u64) {
85
85
  setter!(self, BpeTrainer, min_frequency, freq);
86
86
  }
87
87
 
@@ -235,11 +235,11 @@ impl RbTrainer {
235
235
  setter!(self, WordLevelTrainer, vocab_size, vocab_size);
236
236
  }
237
237
 
238
- fn word_level_trainer_min_frequency(&self) -> u32 {
238
+ fn word_level_trainer_min_frequency(&self) -> u64 {
239
239
  getter!(self, WordLevelTrainer, min_frequency)
240
240
  }
241
241
 
242
- fn word_level_trainer_set_min_frequency(&self, freq: u32) {
242
+ fn word_level_trainer_set_min_frequency(&self, freq: u64) {
243
243
  setter!(self, WordLevelTrainer, min_frequency, freq);
244
244
  }
245
245
 
@@ -289,11 +289,11 @@ impl RbTrainer {
289
289
  setter!(self, WordPieceTrainer, @set_vocab_size, vocab_size);
290
290
  }
291
291
 
292
- fn word_piece_trainer_min_frequency(&self) -> u32 {
292
+ fn word_piece_trainer_min_frequency(&self) -> u64 {
293
293
  getter!(self, WordPieceTrainer, min_frequency())
294
294
  }
295
295
 
296
- fn word_piece_trainer_set_min_frequency(&self, freq: u32) {
296
+ fn word_piece_trainer_set_min_frequency(&self, freq: u64) {
297
297
  setter!(self, WordPieceTrainer, @set_min_frequency, freq);
298
298
  }
299
299
 
@@ -1,8 +1,8 @@
1
1
  module Tokenizers
2
2
  module Decoders
3
3
  class Metaspace
4
- def self.new(replacement: "\u2581", add_prefix_space: true)
5
- _new(replacement, add_prefix_space)
4
+ def self.new(replacement: "\u2581", prepend_scheme: "always", split: true)
5
+ _new(replacement, prepend_scheme, split)
6
6
  end
7
7
  end
8
8
  end
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.15.0"
4
+ TOKENIZERS_VERSION = "0.19.1"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -1,8 +1,8 @@
1
1
  module Tokenizers
2
2
  module PreTokenizers
3
3
  class Metaspace
4
- def self.new(replacement: "\u2581", add_prefix_space: true)
5
- _new(replacement, add_prefix_space)
4
+ def self.new(replacement: "\u2581", prepend_scheme: "always", split: true)
5
+ _new(replacement, prepend_scheme, split)
6
6
  end
7
7
  end
8
8
  end
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.4.3"
2
+ VERSION = "0.5.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.3
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-01-04 00:00:00.000000000 Z
11
+ date: 2024-05-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -93,14 +93,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
93
93
  requirements:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
- version: '3'
96
+ version: '3.1'
97
97
  required_rubygems_version: !ruby/object:Gem::Requirement
98
98
  requirements:
99
99
  - - ">="
100
100
  - !ruby/object:Gem::Version
101
101
  version: '0'
102
102
  requirements: []
103
- rubygems_version: 3.5.3
103
+ rubygems_version: 3.5.9
104
104
  signing_key:
105
105
  specification_version: 4
106
106
  summary: Fast state-of-the-art tokenizers for Ruby