tokenizers 0.4.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/Cargo.lock +65 -67
- data/README.md +1 -1
- data/ext/tokenizers/Cargo.toml +3 -3
- data/ext/tokenizers/src/decoders.rs +31 -10
- data/ext/tokenizers/src/pre_tokenizers.rs +52 -16
- data/ext/tokenizers/src/trainers.rs +6 -6
- data/lib/tokenizers/decoders/metaspace.rb +2 -2
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/pre_tokenizers/metaspace.rb +2 -2
- data/lib/tokenizers/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 258211e71ca06e96bb4ee01b15e29f6f74d3c70d04af246e95b178e10f093059
|
|
4
|
+
data.tar.gz: 6e0b01c577830afdf1c7d677b1377191420d85e0f1f8638893f72cbb7ccef322
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4e0ea1f11dbab96b213190397ee8676d6233568f4fe013970a5a2c32105ed20ec06a5c8bc7379065799de315a0fc6d5f47807f9af47bc6f47926e4147c3eabcc
|
|
7
|
+
data.tar.gz: ccd00b103577c6cff4dded6a3bc42394eccb3e24b950674a33eedf76df7c08bc89cda8219f076fce4cf20d90580da82c03e001a4e49ceb80e56ae4055b4617cf
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,13 @@
|
|
|
1
|
+
## 0.5.0 (2024-05-21)
|
|
2
|
+
|
|
3
|
+
- Updated Tokenizers to 0.19.1
|
|
4
|
+
- Replaced `add_prefix_space` with `prepend_scheme` and `split` options for `Metaspace` decoder and pre-tokenizer
|
|
5
|
+
- Dropped support for Ruby < 3.1
|
|
6
|
+
|
|
7
|
+
## 0.4.4 (2024-02-27)
|
|
8
|
+
|
|
9
|
+
- Updated Tokenizers to 0.15.2
|
|
10
|
+
|
|
1
11
|
## 0.4.3 (2024-01-03)
|
|
2
12
|
|
|
3
13
|
- Added support for Ruby 3.3
|
data/Cargo.lock
CHANGED
|
@@ -40,7 +40,7 @@ dependencies = [
|
|
|
40
40
|
"regex",
|
|
41
41
|
"rustc-hash",
|
|
42
42
|
"shlex",
|
|
43
|
-
"syn
|
|
43
|
+
"syn",
|
|
44
44
|
]
|
|
45
45
|
|
|
46
46
|
[[package]]
|
|
@@ -135,9 +135,9 @@ dependencies = [
|
|
|
135
135
|
|
|
136
136
|
[[package]]
|
|
137
137
|
name = "darling"
|
|
138
|
-
version = "0.
|
|
138
|
+
version = "0.20.8"
|
|
139
139
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
140
|
-
checksum = "
|
|
140
|
+
checksum = "54e36fcd13ed84ffdfda6f5be89b31287cbb80c439841fe69e04841435464391"
|
|
141
141
|
dependencies = [
|
|
142
142
|
"darling_core",
|
|
143
143
|
"darling_macro",
|
|
@@ -145,58 +145,58 @@ dependencies = [
|
|
|
145
145
|
|
|
146
146
|
[[package]]
|
|
147
147
|
name = "darling_core"
|
|
148
|
-
version = "0.
|
|
148
|
+
version = "0.20.8"
|
|
149
149
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
150
|
-
checksum = "
|
|
150
|
+
checksum = "9c2cf1c23a687a1feeb728783b993c4e1ad83d99f351801977dd809b48d0a70f"
|
|
151
151
|
dependencies = [
|
|
152
152
|
"fnv",
|
|
153
153
|
"ident_case",
|
|
154
154
|
"proc-macro2",
|
|
155
155
|
"quote",
|
|
156
156
|
"strsim",
|
|
157
|
-
"syn
|
|
157
|
+
"syn",
|
|
158
158
|
]
|
|
159
159
|
|
|
160
160
|
[[package]]
|
|
161
161
|
name = "darling_macro"
|
|
162
|
-
version = "0.
|
|
162
|
+
version = "0.20.8"
|
|
163
163
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
164
|
-
checksum = "
|
|
164
|
+
checksum = "a668eda54683121533a393014d8692171709ff57a7d61f187b6e782719f8933f"
|
|
165
165
|
dependencies = [
|
|
166
166
|
"darling_core",
|
|
167
167
|
"quote",
|
|
168
|
-
"syn
|
|
168
|
+
"syn",
|
|
169
169
|
]
|
|
170
170
|
|
|
171
171
|
[[package]]
|
|
172
172
|
name = "derive_builder"
|
|
173
|
-
version = "0.
|
|
173
|
+
version = "0.20.0"
|
|
174
174
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
175
|
-
checksum = "
|
|
175
|
+
checksum = "0350b5cb0331628a5916d6c5c0b72e97393b8b6b03b47a9284f4e7f5a405ffd7"
|
|
176
176
|
dependencies = [
|
|
177
177
|
"derive_builder_macro",
|
|
178
178
|
]
|
|
179
179
|
|
|
180
180
|
[[package]]
|
|
181
181
|
name = "derive_builder_core"
|
|
182
|
-
version = "0.
|
|
182
|
+
version = "0.20.0"
|
|
183
183
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
184
|
-
checksum = "
|
|
184
|
+
checksum = "d48cda787f839151732d396ac69e3473923d54312c070ee21e9effcaa8ca0b1d"
|
|
185
185
|
dependencies = [
|
|
186
186
|
"darling",
|
|
187
187
|
"proc-macro2",
|
|
188
188
|
"quote",
|
|
189
|
-
"syn
|
|
189
|
+
"syn",
|
|
190
190
|
]
|
|
191
191
|
|
|
192
192
|
[[package]]
|
|
193
193
|
name = "derive_builder_macro"
|
|
194
|
-
version = "0.
|
|
194
|
+
version = "0.20.0"
|
|
195
195
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
196
|
-
checksum = "
|
|
196
|
+
checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b"
|
|
197
197
|
dependencies = [
|
|
198
198
|
"derive_builder_core",
|
|
199
|
-
"syn
|
|
199
|
+
"syn",
|
|
200
200
|
]
|
|
201
201
|
|
|
202
202
|
[[package]]
|
|
@@ -280,6 +280,15 @@ dependencies = [
|
|
|
280
280
|
"either",
|
|
281
281
|
]
|
|
282
282
|
|
|
283
|
+
[[package]]
|
|
284
|
+
name = "itertools"
|
|
285
|
+
version = "0.12.1"
|
|
286
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
287
|
+
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
|
|
288
|
+
dependencies = [
|
|
289
|
+
"either",
|
|
290
|
+
]
|
|
291
|
+
|
|
283
292
|
[[package]]
|
|
284
293
|
name = "itoa"
|
|
285
294
|
version = "1.0.6"
|
|
@@ -341,9 +350,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
|
|
|
341
350
|
|
|
342
351
|
[[package]]
|
|
343
352
|
name = "magnus"
|
|
344
|
-
version = "0.6.
|
|
353
|
+
version = "0.6.4"
|
|
345
354
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
346
|
-
checksum = "
|
|
355
|
+
checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
|
|
347
356
|
dependencies = [
|
|
348
357
|
"magnus-macros",
|
|
349
358
|
"rb-sys",
|
|
@@ -359,7 +368,7 @@ checksum = "5968c820e2960565f647819f5928a42d6e874551cab9d88d75e3e0660d7f71e3"
|
|
|
359
368
|
dependencies = [
|
|
360
369
|
"proc-macro2",
|
|
361
370
|
"quote",
|
|
362
|
-
"syn
|
|
371
|
+
"syn",
|
|
363
372
|
]
|
|
364
373
|
|
|
365
374
|
[[package]]
|
|
@@ -385,9 +394,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
|
|
385
394
|
|
|
386
395
|
[[package]]
|
|
387
396
|
name = "monostate"
|
|
388
|
-
version = "0.1.
|
|
397
|
+
version = "0.1.12"
|
|
389
398
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
390
|
-
checksum = "
|
|
399
|
+
checksum = "a20fffcd8ca4c69d31e036a71abc400147b41f90895df4edcb36497a1f8af8bf"
|
|
391
400
|
dependencies = [
|
|
392
401
|
"monostate-impl",
|
|
393
402
|
"serde",
|
|
@@ -395,13 +404,13 @@ dependencies = [
|
|
|
395
404
|
|
|
396
405
|
[[package]]
|
|
397
406
|
name = "monostate-impl"
|
|
398
|
-
version = "0.1.
|
|
407
|
+
version = "0.1.12"
|
|
399
408
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
400
|
-
checksum = "
|
|
409
|
+
checksum = "bf307cbbbd777a9c10cec88ddafee572b3484caad5cce0c9236523c3803105a6"
|
|
401
410
|
dependencies = [
|
|
402
411
|
"proc-macro2",
|
|
403
412
|
"quote",
|
|
404
|
-
"syn
|
|
413
|
+
"syn",
|
|
405
414
|
]
|
|
406
415
|
|
|
407
416
|
[[package]]
|
|
@@ -480,18 +489,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
|
|
|
480
489
|
|
|
481
490
|
[[package]]
|
|
482
491
|
name = "proc-macro2"
|
|
483
|
-
version = "1.0.
|
|
492
|
+
version = "1.0.81"
|
|
484
493
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
485
|
-
checksum = "
|
|
494
|
+
checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba"
|
|
486
495
|
dependencies = [
|
|
487
496
|
"unicode-ident",
|
|
488
497
|
]
|
|
489
498
|
|
|
490
499
|
[[package]]
|
|
491
500
|
name = "quote"
|
|
492
|
-
version = "1.0.
|
|
501
|
+
version = "1.0.36"
|
|
493
502
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
494
|
-
checksum = "
|
|
503
|
+
checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
|
|
495
504
|
dependencies = [
|
|
496
505
|
"proc-macro2",
|
|
497
506
|
]
|
|
@@ -528,9 +537,9 @@ dependencies = [
|
|
|
528
537
|
|
|
529
538
|
[[package]]
|
|
530
539
|
name = "rayon"
|
|
531
|
-
version = "1.
|
|
540
|
+
version = "1.10.0"
|
|
532
541
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
533
|
-
checksum = "
|
|
542
|
+
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
|
|
534
543
|
dependencies = [
|
|
535
544
|
"either",
|
|
536
545
|
"rayon-core",
|
|
@@ -543,15 +552,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
543
552
|
checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9"
|
|
544
553
|
dependencies = [
|
|
545
554
|
"either",
|
|
546
|
-
"itertools",
|
|
555
|
+
"itertools 0.11.0",
|
|
547
556
|
"rayon",
|
|
548
557
|
]
|
|
549
558
|
|
|
550
559
|
[[package]]
|
|
551
560
|
name = "rayon-core"
|
|
552
|
-
version = "1.12.
|
|
561
|
+
version = "1.12.1"
|
|
553
562
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
554
|
-
checksum = "
|
|
563
|
+
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
|
|
555
564
|
dependencies = [
|
|
556
565
|
"crossbeam-deque",
|
|
557
566
|
"crossbeam-utils",
|
|
@@ -559,18 +568,18 @@ dependencies = [
|
|
|
559
568
|
|
|
560
569
|
[[package]]
|
|
561
570
|
name = "rb-sys"
|
|
562
|
-
version = "0.9.
|
|
571
|
+
version = "0.9.97"
|
|
563
572
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
564
|
-
checksum = "
|
|
573
|
+
checksum = "47d30bcad206b51f2f66121190ca678dce1fdf3a2eae0ac5d838d1818b19bdf5"
|
|
565
574
|
dependencies = [
|
|
566
575
|
"rb-sys-build",
|
|
567
576
|
]
|
|
568
577
|
|
|
569
578
|
[[package]]
|
|
570
579
|
name = "rb-sys-build"
|
|
571
|
-
version = "0.9.
|
|
580
|
+
version = "0.9.97"
|
|
572
581
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
573
|
-
checksum = "
|
|
582
|
+
checksum = "3cbd92f281615f3c2dcb9dcb0f0576624752afbf9a7f99173b37c4b55b62dd8a"
|
|
574
583
|
dependencies = [
|
|
575
584
|
"bindgen",
|
|
576
585
|
"lazy_static",
|
|
@@ -578,7 +587,7 @@ dependencies = [
|
|
|
578
587
|
"quote",
|
|
579
588
|
"regex",
|
|
580
589
|
"shell-words",
|
|
581
|
-
"syn
|
|
590
|
+
"syn",
|
|
582
591
|
]
|
|
583
592
|
|
|
584
593
|
[[package]]
|
|
@@ -589,9 +598,9 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
|
|
|
589
598
|
|
|
590
599
|
[[package]]
|
|
591
600
|
name = "regex"
|
|
592
|
-
version = "1.
|
|
601
|
+
version = "1.10.4"
|
|
593
602
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
594
|
-
checksum = "
|
|
603
|
+
checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
|
|
595
604
|
dependencies = [
|
|
596
605
|
"aho-corasick",
|
|
597
606
|
"memchr",
|
|
@@ -601,9 +610,9 @@ dependencies = [
|
|
|
601
610
|
|
|
602
611
|
[[package]]
|
|
603
612
|
name = "regex-automata"
|
|
604
|
-
version = "0.
|
|
613
|
+
version = "0.4.6"
|
|
605
614
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
606
|
-
checksum = "
|
|
615
|
+
checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
|
|
607
616
|
dependencies = [
|
|
608
617
|
"aho-corasick",
|
|
609
618
|
"memchr",
|
|
@@ -612,9 +621,9 @@ dependencies = [
|
|
|
612
621
|
|
|
613
622
|
[[package]]
|
|
614
623
|
name = "regex-syntax"
|
|
615
|
-
version = "0.
|
|
624
|
+
version = "0.8.2"
|
|
616
625
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
617
|
-
checksum = "
|
|
626
|
+
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
|
|
618
627
|
|
|
619
628
|
[[package]]
|
|
620
629
|
name = "rustc-hash"
|
|
@@ -657,7 +666,7 @@ checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2"
|
|
|
657
666
|
dependencies = [
|
|
658
667
|
"proc-macro2",
|
|
659
668
|
"quote",
|
|
660
|
-
"syn
|
|
669
|
+
"syn",
|
|
661
670
|
]
|
|
662
671
|
|
|
663
672
|
[[package]]
|
|
@@ -709,20 +718,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
|
|
|
709
718
|
|
|
710
719
|
[[package]]
|
|
711
720
|
name = "syn"
|
|
712
|
-
version = "
|
|
713
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
714
|
-
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
|
|
715
|
-
dependencies = [
|
|
716
|
-
"proc-macro2",
|
|
717
|
-
"quote",
|
|
718
|
-
"unicode-ident",
|
|
719
|
-
]
|
|
720
|
-
|
|
721
|
-
[[package]]
|
|
722
|
-
name = "syn"
|
|
723
|
-
version = "2.0.38"
|
|
721
|
+
version = "2.0.59"
|
|
724
722
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
725
|
-
checksum = "
|
|
723
|
+
checksum = "4a6531ffc7b071655e4ce2e04bd464c4830bb585a61cabb96cf808f05172615a"
|
|
726
724
|
dependencies = [
|
|
727
725
|
"proc-macro2",
|
|
728
726
|
"quote",
|
|
@@ -746,31 +744,31 @@ checksum = "10712f02019e9288794769fba95cd6847df9874d49d871d062172f9dd41bc4cc"
|
|
|
746
744
|
dependencies = [
|
|
747
745
|
"proc-macro2",
|
|
748
746
|
"quote",
|
|
749
|
-
"syn
|
|
747
|
+
"syn",
|
|
750
748
|
]
|
|
751
749
|
|
|
752
750
|
[[package]]
|
|
753
751
|
name = "tokenizers"
|
|
754
|
-
version = "0.
|
|
752
|
+
version = "0.5.0"
|
|
755
753
|
dependencies = [
|
|
756
754
|
"magnus",
|
|
757
755
|
"onig",
|
|
758
756
|
"serde",
|
|
759
|
-
"tokenizers 0.
|
|
757
|
+
"tokenizers 0.19.1",
|
|
760
758
|
]
|
|
761
759
|
|
|
762
760
|
[[package]]
|
|
763
761
|
name = "tokenizers"
|
|
764
|
-
version = "0.
|
|
762
|
+
version = "0.19.1"
|
|
765
763
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
766
|
-
checksum = "
|
|
764
|
+
checksum = "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd"
|
|
767
765
|
dependencies = [
|
|
768
766
|
"aho-corasick",
|
|
769
767
|
"derive_builder",
|
|
770
768
|
"esaxx-rs",
|
|
771
769
|
"getrandom",
|
|
772
770
|
"indicatif",
|
|
773
|
-
"itertools",
|
|
771
|
+
"itertools 0.12.1",
|
|
774
772
|
"lazy_static",
|
|
775
773
|
"log",
|
|
776
774
|
"macro_rules_attribute",
|
|
@@ -808,9 +806,9 @@ dependencies = [
|
|
|
808
806
|
|
|
809
807
|
[[package]]
|
|
810
808
|
name = "unicode-segmentation"
|
|
811
|
-
version = "1.
|
|
809
|
+
version = "1.11.0"
|
|
812
810
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
813
|
-
checksum = "
|
|
811
|
+
checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
|
|
814
812
|
|
|
815
813
|
[[package]]
|
|
816
814
|
name = "unicode-width"
|
data/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
:slightly_smiling_face: Fast state-of-the-art [tokenizers](https://github.com/huggingface/tokenizers) for Ruby
|
|
4
4
|
|
|
5
|
-
[](https://github.com/ankane/tokenizers-ruby/actions)
|
|
6
6
|
|
|
7
7
|
## Installation
|
|
8
8
|
|
data/ext/tokenizers/Cargo.toml
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "tokenizers"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.5.0"
|
|
4
4
|
license = "Apache-2.0"
|
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
|
6
6
|
edition = "2021"
|
|
7
|
-
rust-version = "1.
|
|
7
|
+
rust-version = "1.63.0"
|
|
8
8
|
publish = false
|
|
9
9
|
|
|
10
10
|
[lib]
|
|
@@ -16,6 +16,6 @@ onig = { version = "6", default-features = false }
|
|
|
16
16
|
serde = { version = "1", features = ["rc", "derive"] }
|
|
17
17
|
|
|
18
18
|
[dependencies.tokenizers]
|
|
19
|
-
version = "=0.
|
|
19
|
+
version = "=0.19.1" # also update in from_pretrained.rb
|
|
20
20
|
default-features = false
|
|
21
21
|
features = ["progressbar", "onig", "esaxx_fast"]
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
use std::sync::{Arc, RwLock};
|
|
2
2
|
|
|
3
|
+
use crate::pre_tokenizers::from_string;
|
|
3
4
|
use magnus::value::Lazy;
|
|
4
5
|
use magnus::{
|
|
5
6
|
data_type_builder, function, method, Class, DataType, DataTypeFunctions, Module, Object, RClass, RModule,
|
|
@@ -11,7 +12,7 @@ use tk::decoders::byte_fallback::ByteFallback;
|
|
|
11
12
|
use tk::decoders::byte_level::ByteLevel;
|
|
12
13
|
use tk::decoders::ctc::CTC;
|
|
13
14
|
use tk::decoders::fuse::Fuse;
|
|
14
|
-
use tk::decoders::metaspace::Metaspace;
|
|
15
|
+
use tk::decoders::metaspace::{Metaspace, PrependScheme};
|
|
15
16
|
use tk::decoders::strip::Strip;
|
|
16
17
|
use tk::decoders::wordpiece::WordPiece;
|
|
17
18
|
use tk::decoders::DecoderWrapper;
|
|
@@ -126,12 +127,29 @@ impl RbDecoder {
|
|
|
126
127
|
setter!(self, Metaspace, @set_replacement, replacement);
|
|
127
128
|
}
|
|
128
129
|
|
|
129
|
-
pub fn
|
|
130
|
-
getter!(self, Metaspace,
|
|
130
|
+
pub fn metaspace_split(&self) -> bool {
|
|
131
|
+
getter!(self, Metaspace, get_split())
|
|
131
132
|
}
|
|
132
133
|
|
|
133
|
-
pub fn
|
|
134
|
-
setter!(self, Metaspace,
|
|
134
|
+
pub fn metaspace_set_split(&self, split: bool) {
|
|
135
|
+
setter!(self, Metaspace, @set_split, split);
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
pub fn metaspace_prepend_scheme(&self) -> String {
|
|
139
|
+
// Assuming Metaspace has a method to get the prepend_scheme as a string
|
|
140
|
+
let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
|
|
141
|
+
match scheme {
|
|
142
|
+
PrependScheme::First => "first",
|
|
143
|
+
PrependScheme::Never => "never",
|
|
144
|
+
PrependScheme::Always => "always",
|
|
145
|
+
}
|
|
146
|
+
.to_string()
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
pub fn metaspace_set_prepend_scheme(&self, prepend_scheme: String) -> RbResult<()> {
|
|
150
|
+
let scheme = from_string(prepend_scheme)?;
|
|
151
|
+
setter!(self, Metaspace, @set_prepend_scheme, scheme);
|
|
152
|
+
Ok(())
|
|
135
153
|
}
|
|
136
154
|
|
|
137
155
|
pub fn word_piece_cleanup(&self) -> bool {
|
|
@@ -194,8 +212,9 @@ impl RbFuse {
|
|
|
194
212
|
pub struct RbMetaspaceDecoder {}
|
|
195
213
|
|
|
196
214
|
impl RbMetaspaceDecoder {
|
|
197
|
-
pub fn new(replacement: char,
|
|
198
|
-
|
|
215
|
+
pub fn new(replacement: char, prepend_scheme: String, split: bool) -> RbResult<RbDecoder> {
|
|
216
|
+
let prepend_scheme = from_string(prepend_scheme)?;
|
|
217
|
+
Ok(Metaspace::new(replacement, prepend_scheme, split).into())
|
|
199
218
|
}
|
|
200
219
|
}
|
|
201
220
|
|
|
@@ -364,11 +383,13 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
364
383
|
class.define_singleton_method("new", function!(RbFuse::new, 0))?;
|
|
365
384
|
|
|
366
385
|
let class = module.define_class("Metaspace", decoder)?;
|
|
367
|
-
class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new,
|
|
368
|
-
class.define_method("
|
|
369
|
-
class.define_method("
|
|
386
|
+
class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 3))?;
|
|
387
|
+
class.define_method("prepend_scheme", method!(RbDecoder::metaspace_prepend_scheme, 0))?;
|
|
388
|
+
class.define_method("prepend_scheme=", method!(RbDecoder::metaspace_set_prepend_scheme, 1))?;
|
|
370
389
|
class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
|
|
371
390
|
class.define_method("replacement=", method!(RbDecoder::metaspace_set_replacement, 1))?;
|
|
391
|
+
class.define_method("split", method!(RbDecoder::metaspace_split, 0))?;
|
|
392
|
+
class.define_method("split=", method!(RbDecoder::metaspace_set_split, 1))?;
|
|
372
393
|
|
|
373
394
|
let class = module.define_class("Replace", decoder)?;
|
|
374
395
|
class.define_singleton_method("new", function!(RbReplaceDecoder::new, 2))?;
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
use std::sync::{Arc, RwLock};
|
|
2
2
|
|
|
3
3
|
use magnus::{
|
|
4
|
-
data_type_builder, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Module, Object,
|
|
4
|
+
data_type_builder, exception, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Error, Module, Object,
|
|
5
5
|
RArray, RClass, RModule, Ruby, TryConvert, TypedData,
|
|
6
6
|
};
|
|
7
7
|
|
|
@@ -12,7 +12,7 @@ use tk::pre_tokenizers::bert::BertPreTokenizer;
|
|
|
12
12
|
use tk::pre_tokenizers::byte_level::ByteLevel;
|
|
13
13
|
use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
|
|
14
14
|
use tk::pre_tokenizers::digits::Digits;
|
|
15
|
-
use tk::pre_tokenizers::metaspace::Metaspace;
|
|
15
|
+
use tk::pre_tokenizers::metaspace::{Metaspace, PrependScheme};
|
|
16
16
|
use tk::pre_tokenizers::punctuation::Punctuation;
|
|
17
17
|
use tk::pre_tokenizers::split::Split;
|
|
18
18
|
use tk::pre_tokenizers::unicode_scripts::UnicodeScripts;
|
|
@@ -118,14 +118,6 @@ impl RbPreTokenizer {
|
|
|
118
118
|
setter!(self, Digits, individual_digits, individual_digits);
|
|
119
119
|
}
|
|
120
120
|
|
|
121
|
-
fn metaspace_add_prefix_space(&self) -> bool {
|
|
122
|
-
getter!(self, Metaspace, add_prefix_space)
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
fn metaspace_set_add_prefix_space(&self, add_prefix_space: bool) {
|
|
126
|
-
setter!(self, Metaspace, add_prefix_space, add_prefix_space);
|
|
127
|
-
}
|
|
128
|
-
|
|
129
121
|
fn metaspace_replacement(&self) -> String {
|
|
130
122
|
getter!(self, Metaspace, get_replacement().to_string())
|
|
131
123
|
}
|
|
@@ -133,6 +125,31 @@ impl RbPreTokenizer {
|
|
|
133
125
|
fn metaspace_set_replacement(&self, replacement: char) {
|
|
134
126
|
setter!(self, Metaspace, @set_replacement, replacement);
|
|
135
127
|
}
|
|
128
|
+
|
|
129
|
+
fn metaspace_split(&self) -> bool {
|
|
130
|
+
getter!(self, Metaspace, get_split())
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
fn metaspace_set_split(&self, split: bool) {
|
|
134
|
+
setter!(self, Metaspace, @set_split, split);
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
fn metaspace_prepend_scheme(&self) -> String {
|
|
138
|
+
// Assuming Metaspace has a method to get the prepend_scheme as a string
|
|
139
|
+
let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
|
|
140
|
+
match scheme {
|
|
141
|
+
PrependScheme::First => "first",
|
|
142
|
+
PrependScheme::Never => "never",
|
|
143
|
+
PrependScheme::Always => "always",
|
|
144
|
+
}
|
|
145
|
+
.to_string()
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
fn metaspace_set_prepend_scheme(&self, prepend_scheme: String) -> RbResult<()> {
|
|
149
|
+
let scheme = from_string(prepend_scheme)?;
|
|
150
|
+
setter!(self, Metaspace, @set_prepend_scheme, scheme);
|
|
151
|
+
Ok(())
|
|
152
|
+
}
|
|
136
153
|
}
|
|
137
154
|
|
|
138
155
|
impl PreTokenizer for RbPreTokenizer {
|
|
@@ -180,9 +197,11 @@ pub struct RbMetaspace {}
|
|
|
180
197
|
impl RbMetaspace {
|
|
181
198
|
fn new(
|
|
182
199
|
replacement: char,
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
200
|
+
prepend_scheme: String,
|
|
201
|
+
split: bool,
|
|
202
|
+
) -> RbResult<RbPreTokenizer> {
|
|
203
|
+
let prepend_scheme = from_string(prepend_scheme)?;
|
|
204
|
+
Ok(Metaspace::new(replacement, prepend_scheme, split).into())
|
|
186
205
|
}
|
|
187
206
|
}
|
|
188
207
|
|
|
@@ -252,6 +271,21 @@ impl RbSequence {
|
|
|
252
271
|
}
|
|
253
272
|
}
|
|
254
273
|
|
|
274
|
+
pub(crate) fn from_string(string: String) -> RbResult<PrependScheme> {
|
|
275
|
+
let scheme = match string.as_str() {
|
|
276
|
+
"first" => PrependScheme::First,
|
|
277
|
+
"never" => PrependScheme::Never,
|
|
278
|
+
"always" => PrependScheme::Always,
|
|
279
|
+
_ => {
|
|
280
|
+
return Err(Error::new(exception::arg_error(), format!(
|
|
281
|
+
"{} is an unknown variant, should be one of ['first', 'never', 'always']",
|
|
282
|
+
string
|
|
283
|
+
)));
|
|
284
|
+
}
|
|
285
|
+
};
|
|
286
|
+
Ok(scheme)
|
|
287
|
+
}
|
|
288
|
+
|
|
255
289
|
#[derive(Clone, Deserialize)]
|
|
256
290
|
#[serde(untagged)]
|
|
257
291
|
pub(crate) enum RbPreTokenizerWrapper {
|
|
@@ -465,11 +499,13 @@ pub fn init_pre_tokenizers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
465
499
|
class.define_method("individual_digits=", method!(RbPreTokenizer::digits_set_individual_digits, 1))?;
|
|
466
500
|
|
|
467
501
|
let class = module.define_class("Metaspace", pre_tokenizer)?;
|
|
468
|
-
class.define_singleton_method("_new", function!(RbMetaspace::new,
|
|
469
|
-
class.define_method("
|
|
470
|
-
class.define_method("
|
|
502
|
+
class.define_singleton_method("_new", function!(RbMetaspace::new, 3))?;
|
|
503
|
+
class.define_method("prepend_scheme", method!(RbPreTokenizer::metaspace_prepend_scheme, 0))?;
|
|
504
|
+
class.define_method("prepend_scheme=", method!(RbPreTokenizer::metaspace_set_prepend_scheme, 1))?;
|
|
471
505
|
class.define_method("replacement", method!(RbPreTokenizer::metaspace_replacement, 0))?;
|
|
472
506
|
class.define_method("replacement=", method!(RbPreTokenizer::metaspace_set_replacement, 1))?;
|
|
507
|
+
class.define_method("split", method!(RbPreTokenizer::metaspace_split, 0))?;
|
|
508
|
+
class.define_method("split=", method!(RbPreTokenizer::metaspace_set_split, 1))?;
|
|
473
509
|
|
|
474
510
|
let class = module.define_class("Punctuation", pre_tokenizer)?;
|
|
475
511
|
class.define_singleton_method("_new", function!(RbPunctuation::new, 1))?;
|
|
@@ -77,11 +77,11 @@ impl RbTrainer {
|
|
|
77
77
|
setter!(self, BpeTrainer, vocab_size, vocab_size);
|
|
78
78
|
}
|
|
79
79
|
|
|
80
|
-
fn bpe_trainer_min_frequency(&self) ->
|
|
80
|
+
fn bpe_trainer_min_frequency(&self) -> u64 {
|
|
81
81
|
getter!(self, BpeTrainer, min_frequency)
|
|
82
82
|
}
|
|
83
83
|
|
|
84
|
-
fn bpe_trainer_set_min_frequency(&self, freq:
|
|
84
|
+
fn bpe_trainer_set_min_frequency(&self, freq: u64) {
|
|
85
85
|
setter!(self, BpeTrainer, min_frequency, freq);
|
|
86
86
|
}
|
|
87
87
|
|
|
@@ -235,11 +235,11 @@ impl RbTrainer {
|
|
|
235
235
|
setter!(self, WordLevelTrainer, vocab_size, vocab_size);
|
|
236
236
|
}
|
|
237
237
|
|
|
238
|
-
fn word_level_trainer_min_frequency(&self) ->
|
|
238
|
+
fn word_level_trainer_min_frequency(&self) -> u64 {
|
|
239
239
|
getter!(self, WordLevelTrainer, min_frequency)
|
|
240
240
|
}
|
|
241
241
|
|
|
242
|
-
fn word_level_trainer_set_min_frequency(&self, freq:
|
|
242
|
+
fn word_level_trainer_set_min_frequency(&self, freq: u64) {
|
|
243
243
|
setter!(self, WordLevelTrainer, min_frequency, freq);
|
|
244
244
|
}
|
|
245
245
|
|
|
@@ -289,11 +289,11 @@ impl RbTrainer {
|
|
|
289
289
|
setter!(self, WordPieceTrainer, @set_vocab_size, vocab_size);
|
|
290
290
|
}
|
|
291
291
|
|
|
292
|
-
fn word_piece_trainer_min_frequency(&self) ->
|
|
292
|
+
fn word_piece_trainer_min_frequency(&self) -> u64 {
|
|
293
293
|
getter!(self, WordPieceTrainer, min_frequency())
|
|
294
294
|
}
|
|
295
295
|
|
|
296
|
-
fn word_piece_trainer_set_min_frequency(&self, freq:
|
|
296
|
+
fn word_piece_trainer_set_min_frequency(&self, freq: u64) {
|
|
297
297
|
setter!(self, WordPieceTrainer, @set_min_frequency, freq);
|
|
298
298
|
}
|
|
299
299
|
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
module Tokenizers
|
|
2
2
|
module Decoders
|
|
3
3
|
class Metaspace
|
|
4
|
-
def self.new(replacement: "\u2581",
|
|
5
|
-
_new(replacement,
|
|
4
|
+
def self.new(replacement: "\u2581", prepend_scheme: "always", split: true)
|
|
5
|
+
_new(replacement, prepend_scheme, split)
|
|
6
6
|
end
|
|
7
7
|
end
|
|
8
8
|
end
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
module Tokenizers
|
|
2
2
|
module PreTokenizers
|
|
3
3
|
class Metaspace
|
|
4
|
-
def self.new(replacement: "\u2581",
|
|
5
|
-
_new(replacement,
|
|
4
|
+
def self.new(replacement: "\u2581", prepend_scheme: "always", split: true)
|
|
5
|
+
_new(replacement, prepend_scheme, split)
|
|
6
6
|
end
|
|
7
7
|
end
|
|
8
8
|
end
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tokenizers
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Kane
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2024-
|
|
11
|
+
date: 2024-05-21 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -93,14 +93,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
93
93
|
requirements:
|
|
94
94
|
- - ">="
|
|
95
95
|
- !ruby/object:Gem::Version
|
|
96
|
-
version: '3'
|
|
96
|
+
version: '3.1'
|
|
97
97
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
98
98
|
requirements:
|
|
99
99
|
- - ">="
|
|
100
100
|
- !ruby/object:Gem::Version
|
|
101
101
|
version: '0'
|
|
102
102
|
requirements: []
|
|
103
|
-
rubygems_version: 3.5.
|
|
103
|
+
rubygems_version: 3.5.9
|
|
104
104
|
signing_key:
|
|
105
105
|
specification_version: 4
|
|
106
106
|
summary: Fast state-of-the-art tokenizers for Ruby
|