tokenizers 0.4.4 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/Cargo.lock +55 -72
- data/ext/tokenizers/Cargo.toml +3 -3
- data/ext/tokenizers/src/decoders.rs +31 -10
- data/ext/tokenizers/src/pre_tokenizers.rs +52 -16
- data/lib/tokenizers/decoders/metaspace.rb +2 -2
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/pre_tokenizers/metaspace.rb +2 -2
- data/lib/tokenizers/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 258211e71ca06e96bb4ee01b15e29f6f74d3c70d04af246e95b178e10f093059
|
|
4
|
+
data.tar.gz: 6e0b01c577830afdf1c7d677b1377191420d85e0f1f8638893f72cbb7ccef322
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4e0ea1f11dbab96b213190397ee8676d6233568f4fe013970a5a2c32105ed20ec06a5c8bc7379065799de315a0fc6d5f47807f9af47bc6f47926e4147c3eabcc
|
|
7
|
+
data.tar.gz: ccd00b103577c6cff4dded6a3bc42394eccb3e24b950674a33eedf76df7c08bc89cda8219f076fce4cf20d90580da82c03e001a4e49ceb80e56ae4055b4617cf
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,9 @@
|
|
|
1
|
+
## 0.5.0 (2024-05-21)
|
|
2
|
+
|
|
3
|
+
- Updated Tokenizers to 0.19.1
|
|
4
|
+
- Replaced `add_prefix_space` with `prepend_scheme` and `split` options for `Metaspace` decoder and pre-tokenizer
|
|
5
|
+
- Dropped support for Ruby < 3.1
|
|
6
|
+
|
|
1
7
|
## 0.4.4 (2024-02-27)
|
|
2
8
|
|
|
3
9
|
- Updated Tokenizers to 0.15.2
|
data/Cargo.lock
CHANGED
|
@@ -40,7 +40,7 @@ dependencies = [
|
|
|
40
40
|
"regex",
|
|
41
41
|
"rustc-hash",
|
|
42
42
|
"shlex",
|
|
43
|
-
"syn
|
|
43
|
+
"syn",
|
|
44
44
|
]
|
|
45
45
|
|
|
46
46
|
[[package]]
|
|
@@ -135,9 +135,9 @@ dependencies = [
|
|
|
135
135
|
|
|
136
136
|
[[package]]
|
|
137
137
|
name = "darling"
|
|
138
|
-
version = "0.
|
|
138
|
+
version = "0.20.8"
|
|
139
139
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
140
|
-
checksum = "
|
|
140
|
+
checksum = "54e36fcd13ed84ffdfda6f5be89b31287cbb80c439841fe69e04841435464391"
|
|
141
141
|
dependencies = [
|
|
142
142
|
"darling_core",
|
|
143
143
|
"darling_macro",
|
|
@@ -145,58 +145,58 @@ dependencies = [
|
|
|
145
145
|
|
|
146
146
|
[[package]]
|
|
147
147
|
name = "darling_core"
|
|
148
|
-
version = "0.
|
|
148
|
+
version = "0.20.8"
|
|
149
149
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
150
|
-
checksum = "
|
|
150
|
+
checksum = "9c2cf1c23a687a1feeb728783b993c4e1ad83d99f351801977dd809b48d0a70f"
|
|
151
151
|
dependencies = [
|
|
152
152
|
"fnv",
|
|
153
153
|
"ident_case",
|
|
154
154
|
"proc-macro2",
|
|
155
155
|
"quote",
|
|
156
156
|
"strsim",
|
|
157
|
-
"syn
|
|
157
|
+
"syn",
|
|
158
158
|
]
|
|
159
159
|
|
|
160
160
|
[[package]]
|
|
161
161
|
name = "darling_macro"
|
|
162
|
-
version = "0.
|
|
162
|
+
version = "0.20.8"
|
|
163
163
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
164
|
-
checksum = "
|
|
164
|
+
checksum = "a668eda54683121533a393014d8692171709ff57a7d61f187b6e782719f8933f"
|
|
165
165
|
dependencies = [
|
|
166
166
|
"darling_core",
|
|
167
167
|
"quote",
|
|
168
|
-
"syn
|
|
168
|
+
"syn",
|
|
169
169
|
]
|
|
170
170
|
|
|
171
171
|
[[package]]
|
|
172
172
|
name = "derive_builder"
|
|
173
|
-
version = "0.
|
|
173
|
+
version = "0.20.0"
|
|
174
174
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
175
|
-
checksum = "
|
|
175
|
+
checksum = "0350b5cb0331628a5916d6c5c0b72e97393b8b6b03b47a9284f4e7f5a405ffd7"
|
|
176
176
|
dependencies = [
|
|
177
177
|
"derive_builder_macro",
|
|
178
178
|
]
|
|
179
179
|
|
|
180
180
|
[[package]]
|
|
181
181
|
name = "derive_builder_core"
|
|
182
|
-
version = "0.
|
|
182
|
+
version = "0.20.0"
|
|
183
183
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
184
|
-
checksum = "
|
|
184
|
+
checksum = "d48cda787f839151732d396ac69e3473923d54312c070ee21e9effcaa8ca0b1d"
|
|
185
185
|
dependencies = [
|
|
186
186
|
"darling",
|
|
187
187
|
"proc-macro2",
|
|
188
188
|
"quote",
|
|
189
|
-
"syn
|
|
189
|
+
"syn",
|
|
190
190
|
]
|
|
191
191
|
|
|
192
192
|
[[package]]
|
|
193
193
|
name = "derive_builder_macro"
|
|
194
|
-
version = "0.
|
|
194
|
+
version = "0.20.0"
|
|
195
195
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
196
|
-
checksum = "
|
|
196
|
+
checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b"
|
|
197
197
|
dependencies = [
|
|
198
198
|
"derive_builder_core",
|
|
199
|
-
"syn
|
|
199
|
+
"syn",
|
|
200
200
|
]
|
|
201
201
|
|
|
202
202
|
[[package]]
|
|
@@ -350,9 +350,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
|
|
|
350
350
|
|
|
351
351
|
[[package]]
|
|
352
352
|
name = "magnus"
|
|
353
|
-
version = "0.6.
|
|
353
|
+
version = "0.6.4"
|
|
354
354
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
355
|
-
checksum = "
|
|
355
|
+
checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
|
|
356
356
|
dependencies = [
|
|
357
357
|
"magnus-macros",
|
|
358
358
|
"rb-sys",
|
|
@@ -368,7 +368,7 @@ checksum = "5968c820e2960565f647819f5928a42d6e874551cab9d88d75e3e0660d7f71e3"
|
|
|
368
368
|
dependencies = [
|
|
369
369
|
"proc-macro2",
|
|
370
370
|
"quote",
|
|
371
|
-
"syn
|
|
371
|
+
"syn",
|
|
372
372
|
]
|
|
373
373
|
|
|
374
374
|
[[package]]
|
|
@@ -394,9 +394,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
|
|
394
394
|
|
|
395
395
|
[[package]]
|
|
396
396
|
name = "monostate"
|
|
397
|
-
version = "0.1.
|
|
397
|
+
version = "0.1.12"
|
|
398
398
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
399
|
-
checksum = "
|
|
399
|
+
checksum = "a20fffcd8ca4c69d31e036a71abc400147b41f90895df4edcb36497a1f8af8bf"
|
|
400
400
|
dependencies = [
|
|
401
401
|
"monostate-impl",
|
|
402
402
|
"serde",
|
|
@@ -404,13 +404,13 @@ dependencies = [
|
|
|
404
404
|
|
|
405
405
|
[[package]]
|
|
406
406
|
name = "monostate-impl"
|
|
407
|
-
version = "0.1.
|
|
407
|
+
version = "0.1.12"
|
|
408
408
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
409
|
-
checksum = "
|
|
409
|
+
checksum = "bf307cbbbd777a9c10cec88ddafee572b3484caad5cce0c9236523c3803105a6"
|
|
410
410
|
dependencies = [
|
|
411
411
|
"proc-macro2",
|
|
412
412
|
"quote",
|
|
413
|
-
"syn
|
|
413
|
+
"syn",
|
|
414
414
|
]
|
|
415
415
|
|
|
416
416
|
[[package]]
|
|
@@ -489,18 +489,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
|
|
|
489
489
|
|
|
490
490
|
[[package]]
|
|
491
491
|
name = "proc-macro2"
|
|
492
|
-
version = "1.0.
|
|
492
|
+
version = "1.0.81"
|
|
493
493
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
494
|
-
checksum = "
|
|
494
|
+
checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba"
|
|
495
495
|
dependencies = [
|
|
496
496
|
"unicode-ident",
|
|
497
497
|
]
|
|
498
498
|
|
|
499
499
|
[[package]]
|
|
500
500
|
name = "quote"
|
|
501
|
-
version = "1.0.
|
|
501
|
+
version = "1.0.36"
|
|
502
502
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
503
|
-
checksum = "
|
|
503
|
+
checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
|
|
504
504
|
dependencies = [
|
|
505
505
|
"proc-macro2",
|
|
506
506
|
]
|
|
@@ -537,9 +537,9 @@ dependencies = [
|
|
|
537
537
|
|
|
538
538
|
[[package]]
|
|
539
539
|
name = "rayon"
|
|
540
|
-
version = "1.
|
|
540
|
+
version = "1.10.0"
|
|
541
541
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
542
|
-
checksum = "
|
|
542
|
+
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
|
|
543
543
|
dependencies = [
|
|
544
544
|
"either",
|
|
545
545
|
"rayon-core",
|
|
@@ -558,9 +558,9 @@ dependencies = [
|
|
|
558
558
|
|
|
559
559
|
[[package]]
|
|
560
560
|
name = "rayon-core"
|
|
561
|
-
version = "1.12.
|
|
561
|
+
version = "1.12.1"
|
|
562
562
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
563
|
-
checksum = "
|
|
563
|
+
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
|
|
564
564
|
dependencies = [
|
|
565
565
|
"crossbeam-deque",
|
|
566
566
|
"crossbeam-utils",
|
|
@@ -568,18 +568,18 @@ dependencies = [
|
|
|
568
568
|
|
|
569
569
|
[[package]]
|
|
570
570
|
name = "rb-sys"
|
|
571
|
-
version = "0.9.
|
|
571
|
+
version = "0.9.97"
|
|
572
572
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
573
|
-
checksum = "
|
|
573
|
+
checksum = "47d30bcad206b51f2f66121190ca678dce1fdf3a2eae0ac5d838d1818b19bdf5"
|
|
574
574
|
dependencies = [
|
|
575
575
|
"rb-sys-build",
|
|
576
576
|
]
|
|
577
577
|
|
|
578
578
|
[[package]]
|
|
579
579
|
name = "rb-sys-build"
|
|
580
|
-
version = "0.9.
|
|
580
|
+
version = "0.9.97"
|
|
581
581
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
582
|
-
checksum = "
|
|
582
|
+
checksum = "3cbd92f281615f3c2dcb9dcb0f0576624752afbf9a7f99173b37c4b55b62dd8a"
|
|
583
583
|
dependencies = [
|
|
584
584
|
"bindgen",
|
|
585
585
|
"lazy_static",
|
|
@@ -587,7 +587,7 @@ dependencies = [
|
|
|
587
587
|
"quote",
|
|
588
588
|
"regex",
|
|
589
589
|
"shell-words",
|
|
590
|
-
"syn
|
|
590
|
+
"syn",
|
|
591
591
|
]
|
|
592
592
|
|
|
593
593
|
[[package]]
|
|
@@ -598,33 +598,27 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
|
|
|
598
598
|
|
|
599
599
|
[[package]]
|
|
600
600
|
name = "regex"
|
|
601
|
-
version = "1.
|
|
601
|
+
version = "1.10.4"
|
|
602
602
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
603
|
-
checksum = "
|
|
603
|
+
checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
|
|
604
604
|
dependencies = [
|
|
605
605
|
"aho-corasick",
|
|
606
606
|
"memchr",
|
|
607
607
|
"regex-automata",
|
|
608
|
-
"regex-syntax
|
|
608
|
+
"regex-syntax",
|
|
609
609
|
]
|
|
610
610
|
|
|
611
611
|
[[package]]
|
|
612
612
|
name = "regex-automata"
|
|
613
|
-
version = "0.
|
|
613
|
+
version = "0.4.6"
|
|
614
614
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
615
|
-
checksum = "
|
|
615
|
+
checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
|
|
616
616
|
dependencies = [
|
|
617
617
|
"aho-corasick",
|
|
618
618
|
"memchr",
|
|
619
|
-
"regex-syntax
|
|
619
|
+
"regex-syntax",
|
|
620
620
|
]
|
|
621
621
|
|
|
622
|
-
[[package]]
|
|
623
|
-
name = "regex-syntax"
|
|
624
|
-
version = "0.7.5"
|
|
625
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
626
|
-
checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
|
|
627
|
-
|
|
628
622
|
[[package]]
|
|
629
623
|
name = "regex-syntax"
|
|
630
624
|
version = "0.8.2"
|
|
@@ -672,7 +666,7 @@ checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2"
|
|
|
672
666
|
dependencies = [
|
|
673
667
|
"proc-macro2",
|
|
674
668
|
"quote",
|
|
675
|
-
"syn
|
|
669
|
+
"syn",
|
|
676
670
|
]
|
|
677
671
|
|
|
678
672
|
[[package]]
|
|
@@ -724,20 +718,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
|
|
|
724
718
|
|
|
725
719
|
[[package]]
|
|
726
720
|
name = "syn"
|
|
727
|
-
version = "
|
|
728
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
729
|
-
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
|
|
730
|
-
dependencies = [
|
|
731
|
-
"proc-macro2",
|
|
732
|
-
"quote",
|
|
733
|
-
"unicode-ident",
|
|
734
|
-
]
|
|
735
|
-
|
|
736
|
-
[[package]]
|
|
737
|
-
name = "syn"
|
|
738
|
-
version = "2.0.38"
|
|
721
|
+
version = "2.0.59"
|
|
739
722
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
740
|
-
checksum = "
|
|
723
|
+
checksum = "4a6531ffc7b071655e4ce2e04bd464c4830bb585a61cabb96cf808f05172615a"
|
|
741
724
|
dependencies = [
|
|
742
725
|
"proc-macro2",
|
|
743
726
|
"quote",
|
|
@@ -761,24 +744,24 @@ checksum = "10712f02019e9288794769fba95cd6847df9874d49d871d062172f9dd41bc4cc"
|
|
|
761
744
|
dependencies = [
|
|
762
745
|
"proc-macro2",
|
|
763
746
|
"quote",
|
|
764
|
-
"syn
|
|
747
|
+
"syn",
|
|
765
748
|
]
|
|
766
749
|
|
|
767
750
|
[[package]]
|
|
768
751
|
name = "tokenizers"
|
|
769
|
-
version = "0.
|
|
752
|
+
version = "0.5.0"
|
|
770
753
|
dependencies = [
|
|
771
754
|
"magnus",
|
|
772
755
|
"onig",
|
|
773
756
|
"serde",
|
|
774
|
-
"tokenizers 0.
|
|
757
|
+
"tokenizers 0.19.1",
|
|
775
758
|
]
|
|
776
759
|
|
|
777
760
|
[[package]]
|
|
778
761
|
name = "tokenizers"
|
|
779
|
-
version = "0.
|
|
762
|
+
version = "0.19.1"
|
|
780
763
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
781
|
-
checksum = "
|
|
764
|
+
checksum = "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd"
|
|
782
765
|
dependencies = [
|
|
783
766
|
"aho-corasick",
|
|
784
767
|
"derive_builder",
|
|
@@ -796,7 +779,7 @@ dependencies = [
|
|
|
796
779
|
"rayon",
|
|
797
780
|
"rayon-cond",
|
|
798
781
|
"regex",
|
|
799
|
-
"regex-syntax
|
|
782
|
+
"regex-syntax",
|
|
800
783
|
"serde",
|
|
801
784
|
"serde_json",
|
|
802
785
|
"spm_precompiled",
|
|
@@ -823,9 +806,9 @@ dependencies = [
|
|
|
823
806
|
|
|
824
807
|
[[package]]
|
|
825
808
|
name = "unicode-segmentation"
|
|
826
|
-
version = "1.
|
|
809
|
+
version = "1.11.0"
|
|
827
810
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
828
|
-
checksum = "
|
|
811
|
+
checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
|
|
829
812
|
|
|
830
813
|
[[package]]
|
|
831
814
|
name = "unicode-width"
|
data/ext/tokenizers/Cargo.toml
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "tokenizers"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.5.0"
|
|
4
4
|
license = "Apache-2.0"
|
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
|
6
6
|
edition = "2021"
|
|
7
|
-
rust-version = "1.
|
|
7
|
+
rust-version = "1.63.0"
|
|
8
8
|
publish = false
|
|
9
9
|
|
|
10
10
|
[lib]
|
|
@@ -16,6 +16,6 @@ onig = { version = "6", default-features = false }
|
|
|
16
16
|
serde = { version = "1", features = ["rc", "derive"] }
|
|
17
17
|
|
|
18
18
|
[dependencies.tokenizers]
|
|
19
|
-
version = "=0.
|
|
19
|
+
version = "=0.19.1" # also update in from_pretrained.rb
|
|
20
20
|
default-features = false
|
|
21
21
|
features = ["progressbar", "onig", "esaxx_fast"]
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
use std::sync::{Arc, RwLock};
|
|
2
2
|
|
|
3
|
+
use crate::pre_tokenizers::from_string;
|
|
3
4
|
use magnus::value::Lazy;
|
|
4
5
|
use magnus::{
|
|
5
6
|
data_type_builder, function, method, Class, DataType, DataTypeFunctions, Module, Object, RClass, RModule,
|
|
@@ -11,7 +12,7 @@ use tk::decoders::byte_fallback::ByteFallback;
|
|
|
11
12
|
use tk::decoders::byte_level::ByteLevel;
|
|
12
13
|
use tk::decoders::ctc::CTC;
|
|
13
14
|
use tk::decoders::fuse::Fuse;
|
|
14
|
-
use tk::decoders::metaspace::Metaspace;
|
|
15
|
+
use tk::decoders::metaspace::{Metaspace, PrependScheme};
|
|
15
16
|
use tk::decoders::strip::Strip;
|
|
16
17
|
use tk::decoders::wordpiece::WordPiece;
|
|
17
18
|
use tk::decoders::DecoderWrapper;
|
|
@@ -126,12 +127,29 @@ impl RbDecoder {
|
|
|
126
127
|
setter!(self, Metaspace, @set_replacement, replacement);
|
|
127
128
|
}
|
|
128
129
|
|
|
129
|
-
pub fn
|
|
130
|
-
getter!(self, Metaspace,
|
|
130
|
+
pub fn metaspace_split(&self) -> bool {
|
|
131
|
+
getter!(self, Metaspace, get_split())
|
|
131
132
|
}
|
|
132
133
|
|
|
133
|
-
pub fn
|
|
134
|
-
setter!(self, Metaspace,
|
|
134
|
+
pub fn metaspace_set_split(&self, split: bool) {
|
|
135
|
+
setter!(self, Metaspace, @set_split, split);
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
pub fn metaspace_prepend_scheme(&self) -> String {
|
|
139
|
+
// Assuming Metaspace has a method to get the prepend_scheme as a string
|
|
140
|
+
let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
|
|
141
|
+
match scheme {
|
|
142
|
+
PrependScheme::First => "first",
|
|
143
|
+
PrependScheme::Never => "never",
|
|
144
|
+
PrependScheme::Always => "always",
|
|
145
|
+
}
|
|
146
|
+
.to_string()
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
pub fn metaspace_set_prepend_scheme(&self, prepend_scheme: String) -> RbResult<()> {
|
|
150
|
+
let scheme = from_string(prepend_scheme)?;
|
|
151
|
+
setter!(self, Metaspace, @set_prepend_scheme, scheme);
|
|
152
|
+
Ok(())
|
|
135
153
|
}
|
|
136
154
|
|
|
137
155
|
pub fn word_piece_cleanup(&self) -> bool {
|
|
@@ -194,8 +212,9 @@ impl RbFuse {
|
|
|
194
212
|
pub struct RbMetaspaceDecoder {}
|
|
195
213
|
|
|
196
214
|
impl RbMetaspaceDecoder {
|
|
197
|
-
pub fn new(replacement: char,
|
|
198
|
-
|
|
215
|
+
pub fn new(replacement: char, prepend_scheme: String, split: bool) -> RbResult<RbDecoder> {
|
|
216
|
+
let prepend_scheme = from_string(prepend_scheme)?;
|
|
217
|
+
Ok(Metaspace::new(replacement, prepend_scheme, split).into())
|
|
199
218
|
}
|
|
200
219
|
}
|
|
201
220
|
|
|
@@ -364,11 +383,13 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
364
383
|
class.define_singleton_method("new", function!(RbFuse::new, 0))?;
|
|
365
384
|
|
|
366
385
|
let class = module.define_class("Metaspace", decoder)?;
|
|
367
|
-
class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new,
|
|
368
|
-
class.define_method("
|
|
369
|
-
class.define_method("
|
|
386
|
+
class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 3))?;
|
|
387
|
+
class.define_method("prepend_scheme", method!(RbDecoder::metaspace_prepend_scheme, 0))?;
|
|
388
|
+
class.define_method("prepend_scheme=", method!(RbDecoder::metaspace_set_prepend_scheme, 1))?;
|
|
370
389
|
class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
|
|
371
390
|
class.define_method("replacement=", method!(RbDecoder::metaspace_set_replacement, 1))?;
|
|
391
|
+
class.define_method("split", method!(RbDecoder::metaspace_split, 0))?;
|
|
392
|
+
class.define_method("split=", method!(RbDecoder::metaspace_set_split, 1))?;
|
|
372
393
|
|
|
373
394
|
let class = module.define_class("Replace", decoder)?;
|
|
374
395
|
class.define_singleton_method("new", function!(RbReplaceDecoder::new, 2))?;
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
use std::sync::{Arc, RwLock};
|
|
2
2
|
|
|
3
3
|
use magnus::{
|
|
4
|
-
data_type_builder, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Module, Object,
|
|
4
|
+
data_type_builder, exception, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Error, Module, Object,
|
|
5
5
|
RArray, RClass, RModule, Ruby, TryConvert, TypedData,
|
|
6
6
|
};
|
|
7
7
|
|
|
@@ -12,7 +12,7 @@ use tk::pre_tokenizers::bert::BertPreTokenizer;
|
|
|
12
12
|
use tk::pre_tokenizers::byte_level::ByteLevel;
|
|
13
13
|
use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
|
|
14
14
|
use tk::pre_tokenizers::digits::Digits;
|
|
15
|
-
use tk::pre_tokenizers::metaspace::Metaspace;
|
|
15
|
+
use tk::pre_tokenizers::metaspace::{Metaspace, PrependScheme};
|
|
16
16
|
use tk::pre_tokenizers::punctuation::Punctuation;
|
|
17
17
|
use tk::pre_tokenizers::split::Split;
|
|
18
18
|
use tk::pre_tokenizers::unicode_scripts::UnicodeScripts;
|
|
@@ -118,14 +118,6 @@ impl RbPreTokenizer {
|
|
|
118
118
|
setter!(self, Digits, individual_digits, individual_digits);
|
|
119
119
|
}
|
|
120
120
|
|
|
121
|
-
fn metaspace_add_prefix_space(&self) -> bool {
|
|
122
|
-
getter!(self, Metaspace, add_prefix_space)
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
fn metaspace_set_add_prefix_space(&self, add_prefix_space: bool) {
|
|
126
|
-
setter!(self, Metaspace, add_prefix_space, add_prefix_space);
|
|
127
|
-
}
|
|
128
|
-
|
|
129
121
|
fn metaspace_replacement(&self) -> String {
|
|
130
122
|
getter!(self, Metaspace, get_replacement().to_string())
|
|
131
123
|
}
|
|
@@ -133,6 +125,31 @@ impl RbPreTokenizer {
|
|
|
133
125
|
fn metaspace_set_replacement(&self, replacement: char) {
|
|
134
126
|
setter!(self, Metaspace, @set_replacement, replacement);
|
|
135
127
|
}
|
|
128
|
+
|
|
129
|
+
fn metaspace_split(&self) -> bool {
|
|
130
|
+
getter!(self, Metaspace, get_split())
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
fn metaspace_set_split(&self, split: bool) {
|
|
134
|
+
setter!(self, Metaspace, @set_split, split);
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
fn metaspace_prepend_scheme(&self) -> String {
|
|
138
|
+
// Assuming Metaspace has a method to get the prepend_scheme as a string
|
|
139
|
+
let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
|
|
140
|
+
match scheme {
|
|
141
|
+
PrependScheme::First => "first",
|
|
142
|
+
PrependScheme::Never => "never",
|
|
143
|
+
PrependScheme::Always => "always",
|
|
144
|
+
}
|
|
145
|
+
.to_string()
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
fn metaspace_set_prepend_scheme(&self, prepend_scheme: String) -> RbResult<()> {
|
|
149
|
+
let scheme = from_string(prepend_scheme)?;
|
|
150
|
+
setter!(self, Metaspace, @set_prepend_scheme, scheme);
|
|
151
|
+
Ok(())
|
|
152
|
+
}
|
|
136
153
|
}
|
|
137
154
|
|
|
138
155
|
impl PreTokenizer for RbPreTokenizer {
|
|
@@ -180,9 +197,11 @@ pub struct RbMetaspace {}
|
|
|
180
197
|
impl RbMetaspace {
|
|
181
198
|
fn new(
|
|
182
199
|
replacement: char,
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
200
|
+
prepend_scheme: String,
|
|
201
|
+
split: bool,
|
|
202
|
+
) -> RbResult<RbPreTokenizer> {
|
|
203
|
+
let prepend_scheme = from_string(prepend_scheme)?;
|
|
204
|
+
Ok(Metaspace::new(replacement, prepend_scheme, split).into())
|
|
186
205
|
}
|
|
187
206
|
}
|
|
188
207
|
|
|
@@ -252,6 +271,21 @@ impl RbSequence {
|
|
|
252
271
|
}
|
|
253
272
|
}
|
|
254
273
|
|
|
274
|
+
pub(crate) fn from_string(string: String) -> RbResult<PrependScheme> {
|
|
275
|
+
let scheme = match string.as_str() {
|
|
276
|
+
"first" => PrependScheme::First,
|
|
277
|
+
"never" => PrependScheme::Never,
|
|
278
|
+
"always" => PrependScheme::Always,
|
|
279
|
+
_ => {
|
|
280
|
+
return Err(Error::new(exception::arg_error(), format!(
|
|
281
|
+
"{} is an unknown variant, should be one of ['first', 'never', 'always']",
|
|
282
|
+
string
|
|
283
|
+
)));
|
|
284
|
+
}
|
|
285
|
+
};
|
|
286
|
+
Ok(scheme)
|
|
287
|
+
}
|
|
288
|
+
|
|
255
289
|
#[derive(Clone, Deserialize)]
|
|
256
290
|
#[serde(untagged)]
|
|
257
291
|
pub(crate) enum RbPreTokenizerWrapper {
|
|
@@ -465,11 +499,13 @@ pub fn init_pre_tokenizers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
465
499
|
class.define_method("individual_digits=", method!(RbPreTokenizer::digits_set_individual_digits, 1))?;
|
|
466
500
|
|
|
467
501
|
let class = module.define_class("Metaspace", pre_tokenizer)?;
|
|
468
|
-
class.define_singleton_method("_new", function!(RbMetaspace::new,
|
|
469
|
-
class.define_method("
|
|
470
|
-
class.define_method("
|
|
502
|
+
class.define_singleton_method("_new", function!(RbMetaspace::new, 3))?;
|
|
503
|
+
class.define_method("prepend_scheme", method!(RbPreTokenizer::metaspace_prepend_scheme, 0))?;
|
|
504
|
+
class.define_method("prepend_scheme=", method!(RbPreTokenizer::metaspace_set_prepend_scheme, 1))?;
|
|
471
505
|
class.define_method("replacement", method!(RbPreTokenizer::metaspace_replacement, 0))?;
|
|
472
506
|
class.define_method("replacement=", method!(RbPreTokenizer::metaspace_set_replacement, 1))?;
|
|
507
|
+
class.define_method("split", method!(RbPreTokenizer::metaspace_split, 0))?;
|
|
508
|
+
class.define_method("split=", method!(RbPreTokenizer::metaspace_set_split, 1))?;
|
|
473
509
|
|
|
474
510
|
let class = module.define_class("Punctuation", pre_tokenizer)?;
|
|
475
511
|
class.define_singleton_method("_new", function!(RbPunctuation::new, 1))?;
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
module Tokenizers
|
|
2
2
|
module Decoders
|
|
3
3
|
class Metaspace
|
|
4
|
-
def self.new(replacement: "\u2581",
|
|
5
|
-
_new(replacement,
|
|
4
|
+
def self.new(replacement: "\u2581", prepend_scheme: "always", split: true)
|
|
5
|
+
_new(replacement, prepend_scheme, split)
|
|
6
6
|
end
|
|
7
7
|
end
|
|
8
8
|
end
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
module Tokenizers
|
|
2
2
|
module PreTokenizers
|
|
3
3
|
class Metaspace
|
|
4
|
-
def self.new(replacement: "\u2581",
|
|
5
|
-
_new(replacement,
|
|
4
|
+
def self.new(replacement: "\u2581", prepend_scheme: "always", split: true)
|
|
5
|
+
_new(replacement, prepend_scheme, split)
|
|
6
6
|
end
|
|
7
7
|
end
|
|
8
8
|
end
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tokenizers
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Kane
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2024-
|
|
11
|
+
date: 2024-05-21 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -93,14 +93,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
93
93
|
requirements:
|
|
94
94
|
- - ">="
|
|
95
95
|
- !ruby/object:Gem::Version
|
|
96
|
-
version: '3'
|
|
96
|
+
version: '3.1'
|
|
97
97
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
98
98
|
requirements:
|
|
99
99
|
- - ">="
|
|
100
100
|
- !ruby/object:Gem::Version
|
|
101
101
|
version: '0'
|
|
102
102
|
requirements: []
|
|
103
|
-
rubygems_version: 3.5.
|
|
103
|
+
rubygems_version: 3.5.9
|
|
104
104
|
signing_key:
|
|
105
105
|
specification_version: 4
|
|
106
106
|
summary: Fast state-of-the-art tokenizers for Ruby
|