tokenizers 0.4.4 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/Cargo.lock +55 -72
- data/ext/tokenizers/Cargo.toml +3 -3
- data/ext/tokenizers/src/decoders.rs +31 -10
- data/ext/tokenizers/src/pre_tokenizers.rs +52 -16
- data/lib/tokenizers/decoders/metaspace.rb +2 -2
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/pre_tokenizers/metaspace.rb +2 -2
- data/lib/tokenizers/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 258211e71ca06e96bb4ee01b15e29f6f74d3c70d04af246e95b178e10f093059
|
4
|
+
data.tar.gz: 6e0b01c577830afdf1c7d677b1377191420d85e0f1f8638893f72cbb7ccef322
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4e0ea1f11dbab96b213190397ee8676d6233568f4fe013970a5a2c32105ed20ec06a5c8bc7379065799de315a0fc6d5f47807f9af47bc6f47926e4147c3eabcc
|
7
|
+
data.tar.gz: ccd00b103577c6cff4dded6a3bc42394eccb3e24b950674a33eedf76df7c08bc89cda8219f076fce4cf20d90580da82c03e001a4e49ceb80e56ae4055b4617cf
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
## 0.5.0 (2024-05-21)
|
2
|
+
|
3
|
+
- Updated Tokenizers to 0.19.1
|
4
|
+
- Replaced `add_prefix_space` with `prepend_scheme` and `split` options for `Metaspace` decoder and pre-tokenizer
|
5
|
+
- Dropped support for Ruby < 3.1
|
6
|
+
|
1
7
|
## 0.4.4 (2024-02-27)
|
2
8
|
|
3
9
|
- Updated Tokenizers to 0.15.2
|
data/Cargo.lock
CHANGED
@@ -40,7 +40,7 @@ dependencies = [
|
|
40
40
|
"regex",
|
41
41
|
"rustc-hash",
|
42
42
|
"shlex",
|
43
|
-
"syn
|
43
|
+
"syn",
|
44
44
|
]
|
45
45
|
|
46
46
|
[[package]]
|
@@ -135,9 +135,9 @@ dependencies = [
|
|
135
135
|
|
136
136
|
[[package]]
|
137
137
|
name = "darling"
|
138
|
-
version = "0.
|
138
|
+
version = "0.20.8"
|
139
139
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
140
|
-
checksum = "
|
140
|
+
checksum = "54e36fcd13ed84ffdfda6f5be89b31287cbb80c439841fe69e04841435464391"
|
141
141
|
dependencies = [
|
142
142
|
"darling_core",
|
143
143
|
"darling_macro",
|
@@ -145,58 +145,58 @@ dependencies = [
|
|
145
145
|
|
146
146
|
[[package]]
|
147
147
|
name = "darling_core"
|
148
|
-
version = "0.
|
148
|
+
version = "0.20.8"
|
149
149
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
150
|
-
checksum = "
|
150
|
+
checksum = "9c2cf1c23a687a1feeb728783b993c4e1ad83d99f351801977dd809b48d0a70f"
|
151
151
|
dependencies = [
|
152
152
|
"fnv",
|
153
153
|
"ident_case",
|
154
154
|
"proc-macro2",
|
155
155
|
"quote",
|
156
156
|
"strsim",
|
157
|
-
"syn
|
157
|
+
"syn",
|
158
158
|
]
|
159
159
|
|
160
160
|
[[package]]
|
161
161
|
name = "darling_macro"
|
162
|
-
version = "0.
|
162
|
+
version = "0.20.8"
|
163
163
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
164
|
-
checksum = "
|
164
|
+
checksum = "a668eda54683121533a393014d8692171709ff57a7d61f187b6e782719f8933f"
|
165
165
|
dependencies = [
|
166
166
|
"darling_core",
|
167
167
|
"quote",
|
168
|
-
"syn
|
168
|
+
"syn",
|
169
169
|
]
|
170
170
|
|
171
171
|
[[package]]
|
172
172
|
name = "derive_builder"
|
173
|
-
version = "0.
|
173
|
+
version = "0.20.0"
|
174
174
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
175
|
-
checksum = "
|
175
|
+
checksum = "0350b5cb0331628a5916d6c5c0b72e97393b8b6b03b47a9284f4e7f5a405ffd7"
|
176
176
|
dependencies = [
|
177
177
|
"derive_builder_macro",
|
178
178
|
]
|
179
179
|
|
180
180
|
[[package]]
|
181
181
|
name = "derive_builder_core"
|
182
|
-
version = "0.
|
182
|
+
version = "0.20.0"
|
183
183
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
184
|
-
checksum = "
|
184
|
+
checksum = "d48cda787f839151732d396ac69e3473923d54312c070ee21e9effcaa8ca0b1d"
|
185
185
|
dependencies = [
|
186
186
|
"darling",
|
187
187
|
"proc-macro2",
|
188
188
|
"quote",
|
189
|
-
"syn
|
189
|
+
"syn",
|
190
190
|
]
|
191
191
|
|
192
192
|
[[package]]
|
193
193
|
name = "derive_builder_macro"
|
194
|
-
version = "0.
|
194
|
+
version = "0.20.0"
|
195
195
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
196
|
-
checksum = "
|
196
|
+
checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b"
|
197
197
|
dependencies = [
|
198
198
|
"derive_builder_core",
|
199
|
-
"syn
|
199
|
+
"syn",
|
200
200
|
]
|
201
201
|
|
202
202
|
[[package]]
|
@@ -350,9 +350,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
|
|
350
350
|
|
351
351
|
[[package]]
|
352
352
|
name = "magnus"
|
353
|
-
version = "0.6.
|
353
|
+
version = "0.6.4"
|
354
354
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
355
|
-
checksum = "
|
355
|
+
checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
|
356
356
|
dependencies = [
|
357
357
|
"magnus-macros",
|
358
358
|
"rb-sys",
|
@@ -368,7 +368,7 @@ checksum = "5968c820e2960565f647819f5928a42d6e874551cab9d88d75e3e0660d7f71e3"
|
|
368
368
|
dependencies = [
|
369
369
|
"proc-macro2",
|
370
370
|
"quote",
|
371
|
-
"syn
|
371
|
+
"syn",
|
372
372
|
]
|
373
373
|
|
374
374
|
[[package]]
|
@@ -394,9 +394,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
|
394
394
|
|
395
395
|
[[package]]
|
396
396
|
name = "monostate"
|
397
|
-
version = "0.1.
|
397
|
+
version = "0.1.12"
|
398
398
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
399
|
-
checksum = "
|
399
|
+
checksum = "a20fffcd8ca4c69d31e036a71abc400147b41f90895df4edcb36497a1f8af8bf"
|
400
400
|
dependencies = [
|
401
401
|
"monostate-impl",
|
402
402
|
"serde",
|
@@ -404,13 +404,13 @@ dependencies = [
|
|
404
404
|
|
405
405
|
[[package]]
|
406
406
|
name = "monostate-impl"
|
407
|
-
version = "0.1.
|
407
|
+
version = "0.1.12"
|
408
408
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
409
|
-
checksum = "
|
409
|
+
checksum = "bf307cbbbd777a9c10cec88ddafee572b3484caad5cce0c9236523c3803105a6"
|
410
410
|
dependencies = [
|
411
411
|
"proc-macro2",
|
412
412
|
"quote",
|
413
|
-
"syn
|
413
|
+
"syn",
|
414
414
|
]
|
415
415
|
|
416
416
|
[[package]]
|
@@ -489,18 +489,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
|
|
489
489
|
|
490
490
|
[[package]]
|
491
491
|
name = "proc-macro2"
|
492
|
-
version = "1.0.
|
492
|
+
version = "1.0.81"
|
493
493
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
494
|
-
checksum = "
|
494
|
+
checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba"
|
495
495
|
dependencies = [
|
496
496
|
"unicode-ident",
|
497
497
|
]
|
498
498
|
|
499
499
|
[[package]]
|
500
500
|
name = "quote"
|
501
|
-
version = "1.0.
|
501
|
+
version = "1.0.36"
|
502
502
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
503
|
-
checksum = "
|
503
|
+
checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
|
504
504
|
dependencies = [
|
505
505
|
"proc-macro2",
|
506
506
|
]
|
@@ -537,9 +537,9 @@ dependencies = [
|
|
537
537
|
|
538
538
|
[[package]]
|
539
539
|
name = "rayon"
|
540
|
-
version = "1.
|
540
|
+
version = "1.10.0"
|
541
541
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
542
|
-
checksum = "
|
542
|
+
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
|
543
543
|
dependencies = [
|
544
544
|
"either",
|
545
545
|
"rayon-core",
|
@@ -558,9 +558,9 @@ dependencies = [
|
|
558
558
|
|
559
559
|
[[package]]
|
560
560
|
name = "rayon-core"
|
561
|
-
version = "1.12.
|
561
|
+
version = "1.12.1"
|
562
562
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
563
|
-
checksum = "
|
563
|
+
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
|
564
564
|
dependencies = [
|
565
565
|
"crossbeam-deque",
|
566
566
|
"crossbeam-utils",
|
@@ -568,18 +568,18 @@ dependencies = [
|
|
568
568
|
|
569
569
|
[[package]]
|
570
570
|
name = "rb-sys"
|
571
|
-
version = "0.9.
|
571
|
+
version = "0.9.97"
|
572
572
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
573
|
-
checksum = "
|
573
|
+
checksum = "47d30bcad206b51f2f66121190ca678dce1fdf3a2eae0ac5d838d1818b19bdf5"
|
574
574
|
dependencies = [
|
575
575
|
"rb-sys-build",
|
576
576
|
]
|
577
577
|
|
578
578
|
[[package]]
|
579
579
|
name = "rb-sys-build"
|
580
|
-
version = "0.9.
|
580
|
+
version = "0.9.97"
|
581
581
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
582
|
-
checksum = "
|
582
|
+
checksum = "3cbd92f281615f3c2dcb9dcb0f0576624752afbf9a7f99173b37c4b55b62dd8a"
|
583
583
|
dependencies = [
|
584
584
|
"bindgen",
|
585
585
|
"lazy_static",
|
@@ -587,7 +587,7 @@ dependencies = [
|
|
587
587
|
"quote",
|
588
588
|
"regex",
|
589
589
|
"shell-words",
|
590
|
-
"syn
|
590
|
+
"syn",
|
591
591
|
]
|
592
592
|
|
593
593
|
[[package]]
|
@@ -598,33 +598,27 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
|
|
598
598
|
|
599
599
|
[[package]]
|
600
600
|
name = "regex"
|
601
|
-
version = "1.
|
601
|
+
version = "1.10.4"
|
602
602
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
603
|
-
checksum = "
|
603
|
+
checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
|
604
604
|
dependencies = [
|
605
605
|
"aho-corasick",
|
606
606
|
"memchr",
|
607
607
|
"regex-automata",
|
608
|
-
"regex-syntax
|
608
|
+
"regex-syntax",
|
609
609
|
]
|
610
610
|
|
611
611
|
[[package]]
|
612
612
|
name = "regex-automata"
|
613
|
-
version = "0.
|
613
|
+
version = "0.4.6"
|
614
614
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
615
|
-
checksum = "
|
615
|
+
checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
|
616
616
|
dependencies = [
|
617
617
|
"aho-corasick",
|
618
618
|
"memchr",
|
619
|
-
"regex-syntax
|
619
|
+
"regex-syntax",
|
620
620
|
]
|
621
621
|
|
622
|
-
[[package]]
|
623
|
-
name = "regex-syntax"
|
624
|
-
version = "0.7.5"
|
625
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
626
|
-
checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
|
627
|
-
|
628
622
|
[[package]]
|
629
623
|
name = "regex-syntax"
|
630
624
|
version = "0.8.2"
|
@@ -672,7 +666,7 @@ checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2"
|
|
672
666
|
dependencies = [
|
673
667
|
"proc-macro2",
|
674
668
|
"quote",
|
675
|
-
"syn
|
669
|
+
"syn",
|
676
670
|
]
|
677
671
|
|
678
672
|
[[package]]
|
@@ -724,20 +718,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
|
|
724
718
|
|
725
719
|
[[package]]
|
726
720
|
name = "syn"
|
727
|
-
version = "
|
728
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
729
|
-
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
|
730
|
-
dependencies = [
|
731
|
-
"proc-macro2",
|
732
|
-
"quote",
|
733
|
-
"unicode-ident",
|
734
|
-
]
|
735
|
-
|
736
|
-
[[package]]
|
737
|
-
name = "syn"
|
738
|
-
version = "2.0.38"
|
721
|
+
version = "2.0.59"
|
739
722
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
740
|
-
checksum = "
|
723
|
+
checksum = "4a6531ffc7b071655e4ce2e04bd464c4830bb585a61cabb96cf808f05172615a"
|
741
724
|
dependencies = [
|
742
725
|
"proc-macro2",
|
743
726
|
"quote",
|
@@ -761,24 +744,24 @@ checksum = "10712f02019e9288794769fba95cd6847df9874d49d871d062172f9dd41bc4cc"
|
|
761
744
|
dependencies = [
|
762
745
|
"proc-macro2",
|
763
746
|
"quote",
|
764
|
-
"syn
|
747
|
+
"syn",
|
765
748
|
]
|
766
749
|
|
767
750
|
[[package]]
|
768
751
|
name = "tokenizers"
|
769
|
-
version = "0.
|
752
|
+
version = "0.5.0"
|
770
753
|
dependencies = [
|
771
754
|
"magnus",
|
772
755
|
"onig",
|
773
756
|
"serde",
|
774
|
-
"tokenizers 0.
|
757
|
+
"tokenizers 0.19.1",
|
775
758
|
]
|
776
759
|
|
777
760
|
[[package]]
|
778
761
|
name = "tokenizers"
|
779
|
-
version = "0.
|
762
|
+
version = "0.19.1"
|
780
763
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
781
|
-
checksum = "
|
764
|
+
checksum = "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd"
|
782
765
|
dependencies = [
|
783
766
|
"aho-corasick",
|
784
767
|
"derive_builder",
|
@@ -796,7 +779,7 @@ dependencies = [
|
|
796
779
|
"rayon",
|
797
780
|
"rayon-cond",
|
798
781
|
"regex",
|
799
|
-
"regex-syntax
|
782
|
+
"regex-syntax",
|
800
783
|
"serde",
|
801
784
|
"serde_json",
|
802
785
|
"spm_precompiled",
|
@@ -823,9 +806,9 @@ dependencies = [
|
|
823
806
|
|
824
807
|
[[package]]
|
825
808
|
name = "unicode-segmentation"
|
826
|
-
version = "1.
|
809
|
+
version = "1.11.0"
|
827
810
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
828
|
-
checksum = "
|
811
|
+
checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
|
829
812
|
|
830
813
|
[[package]]
|
831
814
|
name = "unicode-width"
|
data/ext/tokenizers/Cargo.toml
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
[package]
|
2
2
|
name = "tokenizers"
|
3
|
-
version = "0.
|
3
|
+
version = "0.5.0"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
7
|
-
rust-version = "1.
|
7
|
+
rust-version = "1.63.0"
|
8
8
|
publish = false
|
9
9
|
|
10
10
|
[lib]
|
@@ -16,6 +16,6 @@ onig = { version = "6", default-features = false }
|
|
16
16
|
serde = { version = "1", features = ["rc", "derive"] }
|
17
17
|
|
18
18
|
[dependencies.tokenizers]
|
19
|
-
version = "=0.
|
19
|
+
version = "=0.19.1" # also update in from_pretrained.rb
|
20
20
|
default-features = false
|
21
21
|
features = ["progressbar", "onig", "esaxx_fast"]
|
@@ -1,5 +1,6 @@
|
|
1
1
|
use std::sync::{Arc, RwLock};
|
2
2
|
|
3
|
+
use crate::pre_tokenizers::from_string;
|
3
4
|
use magnus::value::Lazy;
|
4
5
|
use magnus::{
|
5
6
|
data_type_builder, function, method, Class, DataType, DataTypeFunctions, Module, Object, RClass, RModule,
|
@@ -11,7 +12,7 @@ use tk::decoders::byte_fallback::ByteFallback;
|
|
11
12
|
use tk::decoders::byte_level::ByteLevel;
|
12
13
|
use tk::decoders::ctc::CTC;
|
13
14
|
use tk::decoders::fuse::Fuse;
|
14
|
-
use tk::decoders::metaspace::Metaspace;
|
15
|
+
use tk::decoders::metaspace::{Metaspace, PrependScheme};
|
15
16
|
use tk::decoders::strip::Strip;
|
16
17
|
use tk::decoders::wordpiece::WordPiece;
|
17
18
|
use tk::decoders::DecoderWrapper;
|
@@ -126,12 +127,29 @@ impl RbDecoder {
|
|
126
127
|
setter!(self, Metaspace, @set_replacement, replacement);
|
127
128
|
}
|
128
129
|
|
129
|
-
pub fn
|
130
|
-
getter!(self, Metaspace,
|
130
|
+
pub fn metaspace_split(&self) -> bool {
|
131
|
+
getter!(self, Metaspace, get_split())
|
131
132
|
}
|
132
133
|
|
133
|
-
pub fn
|
134
|
-
setter!(self, Metaspace,
|
134
|
+
pub fn metaspace_set_split(&self, split: bool) {
|
135
|
+
setter!(self, Metaspace, @set_split, split);
|
136
|
+
}
|
137
|
+
|
138
|
+
pub fn metaspace_prepend_scheme(&self) -> String {
|
139
|
+
// Fetch the Metaspace's PrependScheme and map each variant to its lowercase string name
|
140
|
+
let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
|
141
|
+
match scheme {
|
142
|
+
PrependScheme::First => "first",
|
143
|
+
PrependScheme::Never => "never",
|
144
|
+
PrependScheme::Always => "always",
|
145
|
+
}
|
146
|
+
.to_string()
|
147
|
+
}
|
148
|
+
|
149
|
+
pub fn metaspace_set_prepend_scheme(&self, prepend_scheme: String) -> RbResult<()> {
|
150
|
+
let scheme = from_string(prepend_scheme)?;
|
151
|
+
setter!(self, Metaspace, @set_prepend_scheme, scheme);
|
152
|
+
Ok(())
|
135
153
|
}
|
136
154
|
|
137
155
|
pub fn word_piece_cleanup(&self) -> bool {
|
@@ -194,8 +212,9 @@ impl RbFuse {
|
|
194
212
|
pub struct RbMetaspaceDecoder {}
|
195
213
|
|
196
214
|
impl RbMetaspaceDecoder {
|
197
|
-
pub fn new(replacement: char,
|
198
|
-
|
215
|
+
pub fn new(replacement: char, prepend_scheme: String, split: bool) -> RbResult<RbDecoder> {
|
216
|
+
let prepend_scheme = from_string(prepend_scheme)?;
|
217
|
+
Ok(Metaspace::new(replacement, prepend_scheme, split).into())
|
199
218
|
}
|
200
219
|
}
|
201
220
|
|
@@ -364,11 +383,13 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
364
383
|
class.define_singleton_method("new", function!(RbFuse::new, 0))?;
|
365
384
|
|
366
385
|
let class = module.define_class("Metaspace", decoder)?;
|
367
|
-
class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new,
|
368
|
-
class.define_method("
|
369
|
-
class.define_method("
|
386
|
+
class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 3))?;
|
387
|
+
class.define_method("prepend_scheme", method!(RbDecoder::metaspace_prepend_scheme, 0))?;
|
388
|
+
class.define_method("prepend_scheme=", method!(RbDecoder::metaspace_set_prepend_scheme, 1))?;
|
370
389
|
class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
|
371
390
|
class.define_method("replacement=", method!(RbDecoder::metaspace_set_replacement, 1))?;
|
391
|
+
class.define_method("split", method!(RbDecoder::metaspace_split, 0))?;
|
392
|
+
class.define_method("split=", method!(RbDecoder::metaspace_set_split, 1))?;
|
372
393
|
|
373
394
|
let class = module.define_class("Replace", decoder)?;
|
374
395
|
class.define_singleton_method("new", function!(RbReplaceDecoder::new, 2))?;
|
@@ -1,7 +1,7 @@
|
|
1
1
|
use std::sync::{Arc, RwLock};
|
2
2
|
|
3
3
|
use magnus::{
|
4
|
-
data_type_builder, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Module, Object,
|
4
|
+
data_type_builder, exception, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Error, Module, Object,
|
5
5
|
RArray, RClass, RModule, Ruby, TryConvert, TypedData,
|
6
6
|
};
|
7
7
|
|
@@ -12,7 +12,7 @@ use tk::pre_tokenizers::bert::BertPreTokenizer;
|
|
12
12
|
use tk::pre_tokenizers::byte_level::ByteLevel;
|
13
13
|
use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
|
14
14
|
use tk::pre_tokenizers::digits::Digits;
|
15
|
-
use tk::pre_tokenizers::metaspace::Metaspace;
|
15
|
+
use tk::pre_tokenizers::metaspace::{Metaspace, PrependScheme};
|
16
16
|
use tk::pre_tokenizers::punctuation::Punctuation;
|
17
17
|
use tk::pre_tokenizers::split::Split;
|
18
18
|
use tk::pre_tokenizers::unicode_scripts::UnicodeScripts;
|
@@ -118,14 +118,6 @@ impl RbPreTokenizer {
|
|
118
118
|
setter!(self, Digits, individual_digits, individual_digits);
|
119
119
|
}
|
120
120
|
|
121
|
-
fn metaspace_add_prefix_space(&self) -> bool {
|
122
|
-
getter!(self, Metaspace, add_prefix_space)
|
123
|
-
}
|
124
|
-
|
125
|
-
fn metaspace_set_add_prefix_space(&self, add_prefix_space: bool) {
|
126
|
-
setter!(self, Metaspace, add_prefix_space, add_prefix_space);
|
127
|
-
}
|
128
|
-
|
129
121
|
fn metaspace_replacement(&self) -> String {
|
130
122
|
getter!(self, Metaspace, get_replacement().to_string())
|
131
123
|
}
|
@@ -133,6 +125,31 @@ impl RbPreTokenizer {
|
|
133
125
|
fn metaspace_set_replacement(&self, replacement: char) {
|
134
126
|
setter!(self, Metaspace, @set_replacement, replacement);
|
135
127
|
}
|
128
|
+
|
129
|
+
fn metaspace_split(&self) -> bool {
|
130
|
+
getter!(self, Metaspace, get_split())
|
131
|
+
}
|
132
|
+
|
133
|
+
fn metaspace_set_split(&self, split: bool) {
|
134
|
+
setter!(self, Metaspace, @set_split, split);
|
135
|
+
}
|
136
|
+
|
137
|
+
fn metaspace_prepend_scheme(&self) -> String {
|
138
|
+
// Fetch the Metaspace's PrependScheme and map each variant to its lowercase string name
|
139
|
+
let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
|
140
|
+
match scheme {
|
141
|
+
PrependScheme::First => "first",
|
142
|
+
PrependScheme::Never => "never",
|
143
|
+
PrependScheme::Always => "always",
|
144
|
+
}
|
145
|
+
.to_string()
|
146
|
+
}
|
147
|
+
|
148
|
+
fn metaspace_set_prepend_scheme(&self, prepend_scheme: String) -> RbResult<()> {
|
149
|
+
let scheme = from_string(prepend_scheme)?;
|
150
|
+
setter!(self, Metaspace, @set_prepend_scheme, scheme);
|
151
|
+
Ok(())
|
152
|
+
}
|
136
153
|
}
|
137
154
|
|
138
155
|
impl PreTokenizer for RbPreTokenizer {
|
@@ -180,9 +197,11 @@ pub struct RbMetaspace {}
|
|
180
197
|
impl RbMetaspace {
|
181
198
|
fn new(
|
182
199
|
replacement: char,
|
183
|
-
|
184
|
-
|
185
|
-
|
200
|
+
prepend_scheme: String,
|
201
|
+
split: bool,
|
202
|
+
) -> RbResult<RbPreTokenizer> {
|
203
|
+
let prepend_scheme = from_string(prepend_scheme)?;
|
204
|
+
Ok(Metaspace::new(replacement, prepend_scheme, split).into())
|
186
205
|
}
|
187
206
|
}
|
188
207
|
|
@@ -252,6 +271,21 @@ impl RbSequence {
|
|
252
271
|
}
|
253
272
|
}
|
254
273
|
|
274
|
+
pub(crate) fn from_string(string: String) -> RbResult<PrependScheme> {
|
275
|
+
let scheme = match string.as_str() {
|
276
|
+
"first" => PrependScheme::First,
|
277
|
+
"never" => PrependScheme::Never,
|
278
|
+
"always" => PrependScheme::Always,
|
279
|
+
_ => {
|
280
|
+
return Err(Error::new(exception::arg_error(), format!(
|
281
|
+
"{} is an unknown variant, should be one of ['first', 'never', 'always']",
|
282
|
+
string
|
283
|
+
)));
|
284
|
+
}
|
285
|
+
};
|
286
|
+
Ok(scheme)
|
287
|
+
}
|
288
|
+
|
255
289
|
#[derive(Clone, Deserialize)]
|
256
290
|
#[serde(untagged)]
|
257
291
|
pub(crate) enum RbPreTokenizerWrapper {
|
@@ -465,11 +499,13 @@ pub fn init_pre_tokenizers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
465
499
|
class.define_method("individual_digits=", method!(RbPreTokenizer::digits_set_individual_digits, 1))?;
|
466
500
|
|
467
501
|
let class = module.define_class("Metaspace", pre_tokenizer)?;
|
468
|
-
class.define_singleton_method("_new", function!(RbMetaspace::new,
|
469
|
-
class.define_method("
|
470
|
-
class.define_method("
|
502
|
+
class.define_singleton_method("_new", function!(RbMetaspace::new, 3))?;
|
503
|
+
class.define_method("prepend_scheme", method!(RbPreTokenizer::metaspace_prepend_scheme, 0))?;
|
504
|
+
class.define_method("prepend_scheme=", method!(RbPreTokenizer::metaspace_set_prepend_scheme, 1))?;
|
471
505
|
class.define_method("replacement", method!(RbPreTokenizer::metaspace_replacement, 0))?;
|
472
506
|
class.define_method("replacement=", method!(RbPreTokenizer::metaspace_set_replacement, 1))?;
|
507
|
+
class.define_method("split", method!(RbPreTokenizer::metaspace_split, 0))?;
|
508
|
+
class.define_method("split=", method!(RbPreTokenizer::metaspace_set_split, 1))?;
|
473
509
|
|
474
510
|
let class = module.define_class("Punctuation", pre_tokenizer)?;
|
475
511
|
class.define_singleton_method("_new", function!(RbPunctuation::new, 1))?;
|
@@ -1,8 +1,8 @@
|
|
1
1
|
module Tokenizers
|
2
2
|
module Decoders
|
3
3
|
class Metaspace
|
4
|
-
def self.new(replacement: "\u2581",
|
5
|
-
_new(replacement,
|
4
|
+
def self.new(replacement: "\u2581", prepend_scheme: "always", split: true)
|
5
|
+
_new(replacement, prepend_scheme, split)
|
6
6
|
end
|
7
7
|
end
|
8
8
|
end
|
@@ -1,8 +1,8 @@
|
|
1
1
|
module Tokenizers
|
2
2
|
module PreTokenizers
|
3
3
|
class Metaspace
|
4
|
-
def self.new(replacement: "\u2581",
|
5
|
-
_new(replacement,
|
4
|
+
def self.new(replacement: "\u2581", prepend_scheme: "always", split: true)
|
5
|
+
_new(replacement, prepend_scheme, split)
|
6
6
|
end
|
7
7
|
end
|
8
8
|
end
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-05-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -93,14 +93,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
93
93
|
requirements:
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: '3'
|
96
|
+
version: '3.1'
|
97
97
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
98
98
|
requirements:
|
99
99
|
- - ">="
|
100
100
|
- !ruby/object:Gem::Version
|
101
101
|
version: '0'
|
102
102
|
requirements: []
|
103
|
-
rubygems_version: 3.5.
|
103
|
+
rubygems_version: 3.5.9
|
104
104
|
signing_key:
|
105
105
|
specification_version: 4
|
106
106
|
summary: Fast state-of-the-art tokenizers for Ruby
|