tokenizers 0.4.3 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/Cargo.lock +65 -67
- data/README.md +1 -1
- data/ext/tokenizers/Cargo.toml +3 -3
- data/ext/tokenizers/src/decoders.rs +31 -10
- data/ext/tokenizers/src/pre_tokenizers.rs +52 -16
- data/ext/tokenizers/src/trainers.rs +6 -6
- data/lib/tokenizers/decoders/metaspace.rb +2 -2
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/pre_tokenizers/metaspace.rb +2 -2
- data/lib/tokenizers/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 258211e71ca06e96bb4ee01b15e29f6f74d3c70d04af246e95b178e10f093059
|
4
|
+
data.tar.gz: 6e0b01c577830afdf1c7d677b1377191420d85e0f1f8638893f72cbb7ccef322
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4e0ea1f11dbab96b213190397ee8676d6233568f4fe013970a5a2c32105ed20ec06a5c8bc7379065799de315a0fc6d5f47807f9af47bc6f47926e4147c3eabcc
|
7
|
+
data.tar.gz: ccd00b103577c6cff4dded6a3bc42394eccb3e24b950674a33eedf76df7c08bc89cda8219f076fce4cf20d90580da82c03e001a4e49ceb80e56ae4055b4617cf
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
## 0.5.0 (2024-05-21)
|
2
|
+
|
3
|
+
- Updated Tokenizers to 0.19.1
|
4
|
+
- Replaced `add_prefix_space` with `prepend_scheme` and `split` options for `Metaspace` decoder and pre-tokenizer
|
5
|
+
- Dropped support for Ruby < 3.1
|
6
|
+
|
7
|
+
## 0.4.4 (2024-02-27)
|
8
|
+
|
9
|
+
- Updated Tokenizers to 0.15.2
|
10
|
+
|
1
11
|
## 0.4.3 (2024-01-03)
|
2
12
|
|
3
13
|
- Added support for Ruby 3.3
|
data/Cargo.lock
CHANGED
@@ -40,7 +40,7 @@ dependencies = [
|
|
40
40
|
"regex",
|
41
41
|
"rustc-hash",
|
42
42
|
"shlex",
|
43
|
-
"syn
|
43
|
+
"syn",
|
44
44
|
]
|
45
45
|
|
46
46
|
[[package]]
|
@@ -135,9 +135,9 @@ dependencies = [
|
|
135
135
|
|
136
136
|
[[package]]
|
137
137
|
name = "darling"
|
138
|
-
version = "0.
|
138
|
+
version = "0.20.8"
|
139
139
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
140
|
-
checksum = "
|
140
|
+
checksum = "54e36fcd13ed84ffdfda6f5be89b31287cbb80c439841fe69e04841435464391"
|
141
141
|
dependencies = [
|
142
142
|
"darling_core",
|
143
143
|
"darling_macro",
|
@@ -145,58 +145,58 @@ dependencies = [
|
|
145
145
|
|
146
146
|
[[package]]
|
147
147
|
name = "darling_core"
|
148
|
-
version = "0.
|
148
|
+
version = "0.20.8"
|
149
149
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
150
|
-
checksum = "
|
150
|
+
checksum = "9c2cf1c23a687a1feeb728783b993c4e1ad83d99f351801977dd809b48d0a70f"
|
151
151
|
dependencies = [
|
152
152
|
"fnv",
|
153
153
|
"ident_case",
|
154
154
|
"proc-macro2",
|
155
155
|
"quote",
|
156
156
|
"strsim",
|
157
|
-
"syn
|
157
|
+
"syn",
|
158
158
|
]
|
159
159
|
|
160
160
|
[[package]]
|
161
161
|
name = "darling_macro"
|
162
|
-
version = "0.
|
162
|
+
version = "0.20.8"
|
163
163
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
164
|
-
checksum = "
|
164
|
+
checksum = "a668eda54683121533a393014d8692171709ff57a7d61f187b6e782719f8933f"
|
165
165
|
dependencies = [
|
166
166
|
"darling_core",
|
167
167
|
"quote",
|
168
|
-
"syn
|
168
|
+
"syn",
|
169
169
|
]
|
170
170
|
|
171
171
|
[[package]]
|
172
172
|
name = "derive_builder"
|
173
|
-
version = "0.
|
173
|
+
version = "0.20.0"
|
174
174
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
175
|
-
checksum = "
|
175
|
+
checksum = "0350b5cb0331628a5916d6c5c0b72e97393b8b6b03b47a9284f4e7f5a405ffd7"
|
176
176
|
dependencies = [
|
177
177
|
"derive_builder_macro",
|
178
178
|
]
|
179
179
|
|
180
180
|
[[package]]
|
181
181
|
name = "derive_builder_core"
|
182
|
-
version = "0.
|
182
|
+
version = "0.20.0"
|
183
183
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
184
|
-
checksum = "
|
184
|
+
checksum = "d48cda787f839151732d396ac69e3473923d54312c070ee21e9effcaa8ca0b1d"
|
185
185
|
dependencies = [
|
186
186
|
"darling",
|
187
187
|
"proc-macro2",
|
188
188
|
"quote",
|
189
|
-
"syn
|
189
|
+
"syn",
|
190
190
|
]
|
191
191
|
|
192
192
|
[[package]]
|
193
193
|
name = "derive_builder_macro"
|
194
|
-
version = "0.
|
194
|
+
version = "0.20.0"
|
195
195
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
196
|
-
checksum = "
|
196
|
+
checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b"
|
197
197
|
dependencies = [
|
198
198
|
"derive_builder_core",
|
199
|
-
"syn
|
199
|
+
"syn",
|
200
200
|
]
|
201
201
|
|
202
202
|
[[package]]
|
@@ -280,6 +280,15 @@ dependencies = [
|
|
280
280
|
"either",
|
281
281
|
]
|
282
282
|
|
283
|
+
[[package]]
|
284
|
+
name = "itertools"
|
285
|
+
version = "0.12.1"
|
286
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
287
|
+
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
|
288
|
+
dependencies = [
|
289
|
+
"either",
|
290
|
+
]
|
291
|
+
|
283
292
|
[[package]]
|
284
293
|
name = "itoa"
|
285
294
|
version = "1.0.6"
|
@@ -341,9 +350,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
|
|
341
350
|
|
342
351
|
[[package]]
|
343
352
|
name = "magnus"
|
344
|
-
version = "0.6.
|
353
|
+
version = "0.6.4"
|
345
354
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
346
|
-
checksum = "
|
355
|
+
checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
|
347
356
|
dependencies = [
|
348
357
|
"magnus-macros",
|
349
358
|
"rb-sys",
|
@@ -359,7 +368,7 @@ checksum = "5968c820e2960565f647819f5928a42d6e874551cab9d88d75e3e0660d7f71e3"
|
|
359
368
|
dependencies = [
|
360
369
|
"proc-macro2",
|
361
370
|
"quote",
|
362
|
-
"syn
|
371
|
+
"syn",
|
363
372
|
]
|
364
373
|
|
365
374
|
[[package]]
|
@@ -385,9 +394,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
|
385
394
|
|
386
395
|
[[package]]
|
387
396
|
name = "monostate"
|
388
|
-
version = "0.1.
|
397
|
+
version = "0.1.12"
|
389
398
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
390
|
-
checksum = "
|
399
|
+
checksum = "a20fffcd8ca4c69d31e036a71abc400147b41f90895df4edcb36497a1f8af8bf"
|
391
400
|
dependencies = [
|
392
401
|
"monostate-impl",
|
393
402
|
"serde",
|
@@ -395,13 +404,13 @@ dependencies = [
|
|
395
404
|
|
396
405
|
[[package]]
|
397
406
|
name = "monostate-impl"
|
398
|
-
version = "0.1.
|
407
|
+
version = "0.1.12"
|
399
408
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
400
|
-
checksum = "
|
409
|
+
checksum = "bf307cbbbd777a9c10cec88ddafee572b3484caad5cce0c9236523c3803105a6"
|
401
410
|
dependencies = [
|
402
411
|
"proc-macro2",
|
403
412
|
"quote",
|
404
|
-
"syn
|
413
|
+
"syn",
|
405
414
|
]
|
406
415
|
|
407
416
|
[[package]]
|
@@ -480,18 +489,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
|
|
480
489
|
|
481
490
|
[[package]]
|
482
491
|
name = "proc-macro2"
|
483
|
-
version = "1.0.
|
492
|
+
version = "1.0.81"
|
484
493
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
485
|
-
checksum = "
|
494
|
+
checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba"
|
486
495
|
dependencies = [
|
487
496
|
"unicode-ident",
|
488
497
|
]
|
489
498
|
|
490
499
|
[[package]]
|
491
500
|
name = "quote"
|
492
|
-
version = "1.0.
|
501
|
+
version = "1.0.36"
|
493
502
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
494
|
-
checksum = "
|
503
|
+
checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
|
495
504
|
dependencies = [
|
496
505
|
"proc-macro2",
|
497
506
|
]
|
@@ -528,9 +537,9 @@ dependencies = [
|
|
528
537
|
|
529
538
|
[[package]]
|
530
539
|
name = "rayon"
|
531
|
-
version = "1.
|
540
|
+
version = "1.10.0"
|
532
541
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
533
|
-
checksum = "
|
542
|
+
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
|
534
543
|
dependencies = [
|
535
544
|
"either",
|
536
545
|
"rayon-core",
|
@@ -543,15 +552,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
543
552
|
checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9"
|
544
553
|
dependencies = [
|
545
554
|
"either",
|
546
|
-
"itertools",
|
555
|
+
"itertools 0.11.0",
|
547
556
|
"rayon",
|
548
557
|
]
|
549
558
|
|
550
559
|
[[package]]
|
551
560
|
name = "rayon-core"
|
552
|
-
version = "1.12.
|
561
|
+
version = "1.12.1"
|
553
562
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
554
|
-
checksum = "
|
563
|
+
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
|
555
564
|
dependencies = [
|
556
565
|
"crossbeam-deque",
|
557
566
|
"crossbeam-utils",
|
@@ -559,18 +568,18 @@ dependencies = [
|
|
559
568
|
|
560
569
|
[[package]]
|
561
570
|
name = "rb-sys"
|
562
|
-
version = "0.9.
|
571
|
+
version = "0.9.97"
|
563
572
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
564
|
-
checksum = "
|
573
|
+
checksum = "47d30bcad206b51f2f66121190ca678dce1fdf3a2eae0ac5d838d1818b19bdf5"
|
565
574
|
dependencies = [
|
566
575
|
"rb-sys-build",
|
567
576
|
]
|
568
577
|
|
569
578
|
[[package]]
|
570
579
|
name = "rb-sys-build"
|
571
|
-
version = "0.9.
|
580
|
+
version = "0.9.97"
|
572
581
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
573
|
-
checksum = "
|
582
|
+
checksum = "3cbd92f281615f3c2dcb9dcb0f0576624752afbf9a7f99173b37c4b55b62dd8a"
|
574
583
|
dependencies = [
|
575
584
|
"bindgen",
|
576
585
|
"lazy_static",
|
@@ -578,7 +587,7 @@ dependencies = [
|
|
578
587
|
"quote",
|
579
588
|
"regex",
|
580
589
|
"shell-words",
|
581
|
-
"syn
|
590
|
+
"syn",
|
582
591
|
]
|
583
592
|
|
584
593
|
[[package]]
|
@@ -589,9 +598,9 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
|
|
589
598
|
|
590
599
|
[[package]]
|
591
600
|
name = "regex"
|
592
|
-
version = "1.
|
601
|
+
version = "1.10.4"
|
593
602
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
594
|
-
checksum = "
|
603
|
+
checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
|
595
604
|
dependencies = [
|
596
605
|
"aho-corasick",
|
597
606
|
"memchr",
|
@@ -601,9 +610,9 @@ dependencies = [
|
|
601
610
|
|
602
611
|
[[package]]
|
603
612
|
name = "regex-automata"
|
604
|
-
version = "0.
|
613
|
+
version = "0.4.6"
|
605
614
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
606
|
-
checksum = "
|
615
|
+
checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
|
607
616
|
dependencies = [
|
608
617
|
"aho-corasick",
|
609
618
|
"memchr",
|
@@ -612,9 +621,9 @@ dependencies = [
|
|
612
621
|
|
613
622
|
[[package]]
|
614
623
|
name = "regex-syntax"
|
615
|
-
version = "0.
|
624
|
+
version = "0.8.2"
|
616
625
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
617
|
-
checksum = "
|
626
|
+
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
|
618
627
|
|
619
628
|
[[package]]
|
620
629
|
name = "rustc-hash"
|
@@ -657,7 +666,7 @@ checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2"
|
|
657
666
|
dependencies = [
|
658
667
|
"proc-macro2",
|
659
668
|
"quote",
|
660
|
-
"syn
|
669
|
+
"syn",
|
661
670
|
]
|
662
671
|
|
663
672
|
[[package]]
|
@@ -709,20 +718,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
|
|
709
718
|
|
710
719
|
[[package]]
|
711
720
|
name = "syn"
|
712
|
-
version = "
|
713
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
714
|
-
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
|
715
|
-
dependencies = [
|
716
|
-
"proc-macro2",
|
717
|
-
"quote",
|
718
|
-
"unicode-ident",
|
719
|
-
]
|
720
|
-
|
721
|
-
[[package]]
|
722
|
-
name = "syn"
|
723
|
-
version = "2.0.38"
|
721
|
+
version = "2.0.59"
|
724
722
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
725
|
-
checksum = "
|
723
|
+
checksum = "4a6531ffc7b071655e4ce2e04bd464c4830bb585a61cabb96cf808f05172615a"
|
726
724
|
dependencies = [
|
727
725
|
"proc-macro2",
|
728
726
|
"quote",
|
@@ -746,31 +744,31 @@ checksum = "10712f02019e9288794769fba95cd6847df9874d49d871d062172f9dd41bc4cc"
|
|
746
744
|
dependencies = [
|
747
745
|
"proc-macro2",
|
748
746
|
"quote",
|
749
|
-
"syn
|
747
|
+
"syn",
|
750
748
|
]
|
751
749
|
|
752
750
|
[[package]]
|
753
751
|
name = "tokenizers"
|
754
|
-
version = "0.
|
752
|
+
version = "0.5.0"
|
755
753
|
dependencies = [
|
756
754
|
"magnus",
|
757
755
|
"onig",
|
758
756
|
"serde",
|
759
|
-
"tokenizers 0.
|
757
|
+
"tokenizers 0.19.1",
|
760
758
|
]
|
761
759
|
|
762
760
|
[[package]]
|
763
761
|
name = "tokenizers"
|
764
|
-
version = "0.
|
762
|
+
version = "0.19.1"
|
765
763
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
766
|
-
checksum = "
|
764
|
+
checksum = "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd"
|
767
765
|
dependencies = [
|
768
766
|
"aho-corasick",
|
769
767
|
"derive_builder",
|
770
768
|
"esaxx-rs",
|
771
769
|
"getrandom",
|
772
770
|
"indicatif",
|
773
|
-
"itertools",
|
771
|
+
"itertools 0.12.1",
|
774
772
|
"lazy_static",
|
775
773
|
"log",
|
776
774
|
"macro_rules_attribute",
|
@@ -808,9 +806,9 @@ dependencies = [
|
|
808
806
|
|
809
807
|
[[package]]
|
810
808
|
name = "unicode-segmentation"
|
811
|
-
version = "1.
|
809
|
+
version = "1.11.0"
|
812
810
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
813
|
-
checksum = "
|
811
|
+
checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
|
814
812
|
|
815
813
|
[[package]]
|
816
814
|
name = "unicode-width"
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
:slightly_smiling_face: Fast state-of-the-art [tokenizers](https://github.com/huggingface/tokenizers) for Ruby
|
4
4
|
|
5
|
-
[![Build Status](https://github.com/ankane/tokenizers-ruby/workflows/build/badge.svg
|
5
|
+
[![Build Status](https://github.com/ankane/tokenizers-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/tokenizers-ruby/actions)
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
data/ext/tokenizers/Cargo.toml
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
[package]
|
2
2
|
name = "tokenizers"
|
3
|
-
version = "0.
|
3
|
+
version = "0.5.0"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
7
|
-
rust-version = "1.
|
7
|
+
rust-version = "1.63.0"
|
8
8
|
publish = false
|
9
9
|
|
10
10
|
[lib]
|
@@ -16,6 +16,6 @@ onig = { version = "6", default-features = false }
|
|
16
16
|
serde = { version = "1", features = ["rc", "derive"] }
|
17
17
|
|
18
18
|
[dependencies.tokenizers]
|
19
|
-
version = "=0.
|
19
|
+
version = "=0.19.1" # also update in from_pretrained.rb
|
20
20
|
default-features = false
|
21
21
|
features = ["progressbar", "onig", "esaxx_fast"]
|
data/ext/tokenizers/src/decoders.rs
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
use std::sync::{Arc, RwLock};
|
2
2
|
|
3
|
+
use crate::pre_tokenizers::from_string;
|
3
4
|
use magnus::value::Lazy;
|
4
5
|
use magnus::{
|
5
6
|
data_type_builder, function, method, Class, DataType, DataTypeFunctions, Module, Object, RClass, RModule,
|
@@ -11,7 +12,7 @@ use tk::decoders::byte_fallback::ByteFallback;
|
|
11
12
|
use tk::decoders::byte_level::ByteLevel;
|
12
13
|
use tk::decoders::ctc::CTC;
|
13
14
|
use tk::decoders::fuse::Fuse;
|
14
|
-
use tk::decoders::metaspace::Metaspace;
|
15
|
+
use tk::decoders::metaspace::{Metaspace, PrependScheme};
|
15
16
|
use tk::decoders::strip::Strip;
|
16
17
|
use tk::decoders::wordpiece::WordPiece;
|
17
18
|
use tk::decoders::DecoderWrapper;
|
@@ -126,12 +127,29 @@ impl RbDecoder {
|
|
126
127
|
setter!(self, Metaspace, @set_replacement, replacement);
|
127
128
|
}
|
128
129
|
|
129
|
-
pub fn
|
130
|
-
getter!(self, Metaspace,
|
130
|
+
pub fn metaspace_split(&self) -> bool {
|
131
|
+
getter!(self, Metaspace, get_split())
|
131
132
|
}
|
132
133
|
|
133
|
-
pub fn
|
134
|
-
setter!(self, Metaspace,
|
134
|
+
pub fn metaspace_set_split(&self, split: bool) {
|
135
|
+
setter!(self, Metaspace, @set_split, split);
|
136
|
+
}
|
137
|
+
|
138
|
+
pub fn metaspace_prepend_scheme(&self) -> String {
|
139
|
+
// Read the PrependScheme enum from the Metaspace decoder and convert it to its lowercase string name
|
140
|
+
let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
|
141
|
+
match scheme {
|
142
|
+
PrependScheme::First => "first",
|
143
|
+
PrependScheme::Never => "never",
|
144
|
+
PrependScheme::Always => "always",
|
145
|
+
}
|
146
|
+
.to_string()
|
147
|
+
}
|
148
|
+
|
149
|
+
pub fn metaspace_set_prepend_scheme(&self, prepend_scheme: String) -> RbResult<()> {
|
150
|
+
let scheme = from_string(prepend_scheme)?;
|
151
|
+
setter!(self, Metaspace, @set_prepend_scheme, scheme);
|
152
|
+
Ok(())
|
135
153
|
}
|
136
154
|
|
137
155
|
pub fn word_piece_cleanup(&self) -> bool {
|
@@ -194,8 +212,9 @@ impl RbFuse {
|
|
194
212
|
pub struct RbMetaspaceDecoder {}
|
195
213
|
|
196
214
|
impl RbMetaspaceDecoder {
|
197
|
-
pub fn new(replacement: char,
|
198
|
-
|
215
|
+
pub fn new(replacement: char, prepend_scheme: String, split: bool) -> RbResult<RbDecoder> {
|
216
|
+
let prepend_scheme = from_string(prepend_scheme)?;
|
217
|
+
Ok(Metaspace::new(replacement, prepend_scheme, split).into())
|
199
218
|
}
|
200
219
|
}
|
201
220
|
|
@@ -364,11 +383,13 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
364
383
|
class.define_singleton_method("new", function!(RbFuse::new, 0))?;
|
365
384
|
|
366
385
|
let class = module.define_class("Metaspace", decoder)?;
|
367
|
-
class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new,
|
368
|
-
class.define_method("
|
369
|
-
class.define_method("
|
386
|
+
class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 3))?;
|
387
|
+
class.define_method("prepend_scheme", method!(RbDecoder::metaspace_prepend_scheme, 0))?;
|
388
|
+
class.define_method("prepend_scheme=", method!(RbDecoder::metaspace_set_prepend_scheme, 1))?;
|
370
389
|
class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
|
371
390
|
class.define_method("replacement=", method!(RbDecoder::metaspace_set_replacement, 1))?;
|
391
|
+
class.define_method("split", method!(RbDecoder::metaspace_split, 0))?;
|
392
|
+
class.define_method("split=", method!(RbDecoder::metaspace_set_split, 1))?;
|
372
393
|
|
373
394
|
let class = module.define_class("Replace", decoder)?;
|
374
395
|
class.define_singleton_method("new", function!(RbReplaceDecoder::new, 2))?;
|
data/ext/tokenizers/src/pre_tokenizers.rs
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
use std::sync::{Arc, RwLock};
|
2
2
|
|
3
3
|
use magnus::{
|
4
|
-
data_type_builder, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Module, Object,
|
4
|
+
data_type_builder, exception, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Error, Module, Object,
|
5
5
|
RArray, RClass, RModule, Ruby, TryConvert, TypedData,
|
6
6
|
};
|
7
7
|
|
@@ -12,7 +12,7 @@ use tk::pre_tokenizers::bert::BertPreTokenizer;
|
|
12
12
|
use tk::pre_tokenizers::byte_level::ByteLevel;
|
13
13
|
use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
|
14
14
|
use tk::pre_tokenizers::digits::Digits;
|
15
|
-
use tk::pre_tokenizers::metaspace::Metaspace;
|
15
|
+
use tk::pre_tokenizers::metaspace::{Metaspace, PrependScheme};
|
16
16
|
use tk::pre_tokenizers::punctuation::Punctuation;
|
17
17
|
use tk::pre_tokenizers::split::Split;
|
18
18
|
use tk::pre_tokenizers::unicode_scripts::UnicodeScripts;
|
@@ -118,14 +118,6 @@ impl RbPreTokenizer {
|
|
118
118
|
setter!(self, Digits, individual_digits, individual_digits);
|
119
119
|
}
|
120
120
|
|
121
|
-
fn metaspace_add_prefix_space(&self) -> bool {
|
122
|
-
getter!(self, Metaspace, add_prefix_space)
|
123
|
-
}
|
124
|
-
|
125
|
-
fn metaspace_set_add_prefix_space(&self, add_prefix_space: bool) {
|
126
|
-
setter!(self, Metaspace, add_prefix_space, add_prefix_space);
|
127
|
-
}
|
128
|
-
|
129
121
|
fn metaspace_replacement(&self) -> String {
|
130
122
|
getter!(self, Metaspace, get_replacement().to_string())
|
131
123
|
}
|
@@ -133,6 +125,31 @@ impl RbPreTokenizer {
|
|
133
125
|
fn metaspace_set_replacement(&self, replacement: char) {
|
134
126
|
setter!(self, Metaspace, @set_replacement, replacement);
|
135
127
|
}
|
128
|
+
|
129
|
+
fn metaspace_split(&self) -> bool {
|
130
|
+
getter!(self, Metaspace, get_split())
|
131
|
+
}
|
132
|
+
|
133
|
+
fn metaspace_set_split(&self, split: bool) {
|
134
|
+
setter!(self, Metaspace, @set_split, split);
|
135
|
+
}
|
136
|
+
|
137
|
+
fn metaspace_prepend_scheme(&self) -> String {
|
138
|
+
// Read the PrependScheme enum from the Metaspace pre-tokenizer and convert it to its lowercase string name
|
139
|
+
let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
|
140
|
+
match scheme {
|
141
|
+
PrependScheme::First => "first",
|
142
|
+
PrependScheme::Never => "never",
|
143
|
+
PrependScheme::Always => "always",
|
144
|
+
}
|
145
|
+
.to_string()
|
146
|
+
}
|
147
|
+
|
148
|
+
fn metaspace_set_prepend_scheme(&self, prepend_scheme: String) -> RbResult<()> {
|
149
|
+
let scheme = from_string(prepend_scheme)?;
|
150
|
+
setter!(self, Metaspace, @set_prepend_scheme, scheme);
|
151
|
+
Ok(())
|
152
|
+
}
|
136
153
|
}
|
137
154
|
|
138
155
|
impl PreTokenizer for RbPreTokenizer {
|
@@ -180,9 +197,11 @@ pub struct RbMetaspace {}
|
|
180
197
|
impl RbMetaspace {
|
181
198
|
fn new(
|
182
199
|
replacement: char,
|
183
|
-
|
184
|
-
|
185
|
-
|
200
|
+
prepend_scheme: String,
|
201
|
+
split: bool,
|
202
|
+
) -> RbResult<RbPreTokenizer> {
|
203
|
+
let prepend_scheme = from_string(prepend_scheme)?;
|
204
|
+
Ok(Metaspace::new(replacement, prepend_scheme, split).into())
|
186
205
|
}
|
187
206
|
}
|
188
207
|
|
@@ -252,6 +271,21 @@ impl RbSequence {
|
|
252
271
|
}
|
253
272
|
}
|
254
273
|
|
274
|
+
pub(crate) fn from_string(string: String) -> RbResult<PrependScheme> {
|
275
|
+
let scheme = match string.as_str() {
|
276
|
+
"first" => PrependScheme::First,
|
277
|
+
"never" => PrependScheme::Never,
|
278
|
+
"always" => PrependScheme::Always,
|
279
|
+
_ => {
|
280
|
+
return Err(Error::new(exception::arg_error(), format!(
|
281
|
+
"{} is an unknown variant, should be one of ['first', 'never', 'always']",
|
282
|
+
string
|
283
|
+
)));
|
284
|
+
}
|
285
|
+
};
|
286
|
+
Ok(scheme)
|
287
|
+
}
|
288
|
+
|
255
289
|
#[derive(Clone, Deserialize)]
|
256
290
|
#[serde(untagged)]
|
257
291
|
pub(crate) enum RbPreTokenizerWrapper {
|
@@ -465,11 +499,13 @@ pub fn init_pre_tokenizers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
465
499
|
class.define_method("individual_digits=", method!(RbPreTokenizer::digits_set_individual_digits, 1))?;
|
466
500
|
|
467
501
|
let class = module.define_class("Metaspace", pre_tokenizer)?;
|
468
|
-
class.define_singleton_method("_new", function!(RbMetaspace::new,
|
469
|
-
class.define_method("
|
470
|
-
class.define_method("
|
502
|
+
class.define_singleton_method("_new", function!(RbMetaspace::new, 3))?;
|
503
|
+
class.define_method("prepend_scheme", method!(RbPreTokenizer::metaspace_prepend_scheme, 0))?;
|
504
|
+
class.define_method("prepend_scheme=", method!(RbPreTokenizer::metaspace_set_prepend_scheme, 1))?;
|
471
505
|
class.define_method("replacement", method!(RbPreTokenizer::metaspace_replacement, 0))?;
|
472
506
|
class.define_method("replacement=", method!(RbPreTokenizer::metaspace_set_replacement, 1))?;
|
507
|
+
class.define_method("split", method!(RbPreTokenizer::metaspace_split, 0))?;
|
508
|
+
class.define_method("split=", method!(RbPreTokenizer::metaspace_set_split, 1))?;
|
473
509
|
|
474
510
|
let class = module.define_class("Punctuation", pre_tokenizer)?;
|
475
511
|
class.define_singleton_method("_new", function!(RbPunctuation::new, 1))?;
|
data/ext/tokenizers/src/trainers.rs
CHANGED
@@ -77,11 +77,11 @@ impl RbTrainer {
|
|
77
77
|
setter!(self, BpeTrainer, vocab_size, vocab_size);
|
78
78
|
}
|
79
79
|
|
80
|
-
fn bpe_trainer_min_frequency(&self) ->
|
80
|
+
fn bpe_trainer_min_frequency(&self) -> u64 {
|
81
81
|
getter!(self, BpeTrainer, min_frequency)
|
82
82
|
}
|
83
83
|
|
84
|
-
fn bpe_trainer_set_min_frequency(&self, freq:
|
84
|
+
fn bpe_trainer_set_min_frequency(&self, freq: u64) {
|
85
85
|
setter!(self, BpeTrainer, min_frequency, freq);
|
86
86
|
}
|
87
87
|
|
@@ -235,11 +235,11 @@ impl RbTrainer {
|
|
235
235
|
setter!(self, WordLevelTrainer, vocab_size, vocab_size);
|
236
236
|
}
|
237
237
|
|
238
|
-
fn word_level_trainer_min_frequency(&self) ->
|
238
|
+
fn word_level_trainer_min_frequency(&self) -> u64 {
|
239
239
|
getter!(self, WordLevelTrainer, min_frequency)
|
240
240
|
}
|
241
241
|
|
242
|
-
fn word_level_trainer_set_min_frequency(&self, freq:
|
242
|
+
fn word_level_trainer_set_min_frequency(&self, freq: u64) {
|
243
243
|
setter!(self, WordLevelTrainer, min_frequency, freq);
|
244
244
|
}
|
245
245
|
|
@@ -289,11 +289,11 @@ impl RbTrainer {
|
|
289
289
|
setter!(self, WordPieceTrainer, @set_vocab_size, vocab_size);
|
290
290
|
}
|
291
291
|
|
292
|
-
fn word_piece_trainer_min_frequency(&self) ->
|
292
|
+
fn word_piece_trainer_min_frequency(&self) -> u64 {
|
293
293
|
getter!(self, WordPieceTrainer, min_frequency())
|
294
294
|
}
|
295
295
|
|
296
|
-
fn word_piece_trainer_set_min_frequency(&self, freq:
|
296
|
+
fn word_piece_trainer_set_min_frequency(&self, freq: u64) {
|
297
297
|
setter!(self, WordPieceTrainer, @set_min_frequency, freq);
|
298
298
|
}
|
299
299
|
|
data/lib/tokenizers/decoders/metaspace.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
module Tokenizers
|
2
2
|
module Decoders
|
3
3
|
class Metaspace
|
4
|
-
def self.new(replacement: "\u2581",
|
5
|
-
_new(replacement,
|
4
|
+
def self.new(replacement: "\u2581", prepend_scheme: "always", split: true)
|
5
|
+
_new(replacement, prepend_scheme, split)
|
6
6
|
end
|
7
7
|
end
|
8
8
|
end
|
data/lib/tokenizers/pre_tokenizers/metaspace.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
module Tokenizers
|
2
2
|
module PreTokenizers
|
3
3
|
class Metaspace
|
4
|
-
def self.new(replacement: "\u2581",
|
5
|
-
_new(replacement,
|
4
|
+
def self.new(replacement: "\u2581", prepend_scheme: "always", split: true)
|
5
|
+
_new(replacement, prepend_scheme, split)
|
6
6
|
end
|
7
7
|
end
|
8
8
|
end
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-05-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -93,14 +93,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
93
93
|
requirements:
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: '3'
|
96
|
+
version: '3.1'
|
97
97
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
98
98
|
requirements:
|
99
99
|
- - ">="
|
100
100
|
- !ruby/object:Gem::Version
|
101
101
|
version: '0'
|
102
102
|
requirements: []
|
103
|
-
rubygems_version: 3.5.
|
103
|
+
rubygems_version: 3.5.9
|
104
104
|
signing_key:
|
105
105
|
specification_version: 4
|
106
106
|
summary: Fast state-of-the-art tokenizers for Ruby
|