tokenizers 0.3.2 → 0.3.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/Cargo.lock +125 -90
- data/ext/tokenizers/Cargo.toml +4 -5
- data/ext/tokenizers/src/decoders.rs +100 -1
- data/ext/tokenizers/src/models.rs +15 -0
- data/ext/tokenizers/src/normalizers.rs +28 -2
- data/lib/tokenizers/decoders/strip.rb +9 -0
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/normalizers/prepend.rb +9 -0
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +2 -0
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e6e88ec5618e36e317434410c960603695806bb59dadb2252f2957d8dbf0525b
|
4
|
+
data.tar.gz: 33a04a4a5faada27e6e7246c16d836a4ff9f6793e89de3cfd4880e30c6c8ed0d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 88e4f2ad57fd1d66cd5fcf0d8b7ff6b1ea902258296fb02d207a446032134189e3445a104658074e94f914331c94f46cfdd09eed7c745c0483cb3b32b09e6abf
|
7
|
+
data.tar.gz: e8a1721ecbd36874322477077331743b0d1ba2de6f90076e07ad5456c230f76625d7f28ed6e6026c11395c6bb27701a6b8c0feedf2050387d32d9b777baa51fe
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
@@ -71,9 +71,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
|
71
71
|
|
72
72
|
[[package]]
|
73
73
|
name = "clang-sys"
|
74
|
-
version = "1.
|
74
|
+
version = "1.6.1"
|
75
75
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
76
|
-
checksum = "
|
76
|
+
checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f"
|
77
77
|
dependencies = [
|
78
78
|
"glob",
|
79
79
|
"libc",
|
@@ -95,9 +95,9 @@ dependencies = [
|
|
95
95
|
|
96
96
|
[[package]]
|
97
97
|
name = "crossbeam-channel"
|
98
|
-
version = "0.5.
|
98
|
+
version = "0.5.8"
|
99
99
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
100
|
-
checksum = "
|
100
|
+
checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200"
|
101
101
|
dependencies = [
|
102
102
|
"cfg-if",
|
103
103
|
"crossbeam-utils",
|
@@ -105,9 +105,9 @@ dependencies = [
|
|
105
105
|
|
106
106
|
[[package]]
|
107
107
|
name = "crossbeam-deque"
|
108
|
-
version = "0.8.
|
108
|
+
version = "0.8.3"
|
109
109
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
110
|
-
checksum = "
|
110
|
+
checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
|
111
111
|
dependencies = [
|
112
112
|
"cfg-if",
|
113
113
|
"crossbeam-epoch",
|
@@ -116,9 +116,9 @@ dependencies = [
|
|
116
116
|
|
117
117
|
[[package]]
|
118
118
|
name = "crossbeam-epoch"
|
119
|
-
version = "0.9.
|
119
|
+
version = "0.9.14"
|
120
120
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
121
|
-
checksum = "
|
121
|
+
checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
|
122
122
|
dependencies = [
|
123
123
|
"autocfg",
|
124
124
|
"cfg-if",
|
@@ -129,18 +129,18 @@ dependencies = [
|
|
129
129
|
|
130
130
|
[[package]]
|
131
131
|
name = "crossbeam-utils"
|
132
|
-
version = "0.8.
|
132
|
+
version = "0.8.15"
|
133
133
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
134
|
-
checksum = "
|
134
|
+
checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b"
|
135
135
|
dependencies = [
|
136
136
|
"cfg-if",
|
137
137
|
]
|
138
138
|
|
139
139
|
[[package]]
|
140
140
|
name = "darling"
|
141
|
-
version = "0.14.
|
141
|
+
version = "0.14.4"
|
142
142
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
143
|
-
checksum = "
|
143
|
+
checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850"
|
144
144
|
dependencies = [
|
145
145
|
"darling_core",
|
146
146
|
"darling_macro",
|
@@ -148,27 +148,27 @@ dependencies = [
|
|
148
148
|
|
149
149
|
[[package]]
|
150
150
|
name = "darling_core"
|
151
|
-
version = "0.14.
|
151
|
+
version = "0.14.4"
|
152
152
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
153
|
-
checksum = "
|
153
|
+
checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0"
|
154
154
|
dependencies = [
|
155
155
|
"fnv",
|
156
156
|
"ident_case",
|
157
157
|
"proc-macro2",
|
158
158
|
"quote",
|
159
159
|
"strsim",
|
160
|
-
"syn",
|
160
|
+
"syn 1.0.109",
|
161
161
|
]
|
162
162
|
|
163
163
|
[[package]]
|
164
164
|
name = "darling_macro"
|
165
|
-
version = "0.14.
|
165
|
+
version = "0.14.4"
|
166
166
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
167
|
-
checksum = "
|
167
|
+
checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e"
|
168
168
|
dependencies = [
|
169
169
|
"darling_core",
|
170
170
|
"quote",
|
171
|
-
"syn",
|
171
|
+
"syn 1.0.109",
|
172
172
|
]
|
173
173
|
|
174
174
|
[[package]]
|
@@ -189,7 +189,7 @@ dependencies = [
|
|
189
189
|
"darling",
|
190
190
|
"proc-macro2",
|
191
191
|
"quote",
|
192
|
-
"syn",
|
192
|
+
"syn 1.0.109",
|
193
193
|
]
|
194
194
|
|
195
195
|
[[package]]
|
@@ -199,7 +199,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
199
199
|
checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e"
|
200
200
|
dependencies = [
|
201
201
|
"derive_builder_core",
|
202
|
-
"syn",
|
202
|
+
"syn 1.0.109",
|
203
203
|
]
|
204
204
|
|
205
205
|
[[package]]
|
@@ -231,9 +231,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
|
231
231
|
|
232
232
|
[[package]]
|
233
233
|
name = "getrandom"
|
234
|
-
version = "0.2.
|
234
|
+
version = "0.2.9"
|
235
235
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
236
|
-
checksum = "
|
236
|
+
checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4"
|
237
237
|
dependencies = [
|
238
238
|
"cfg-if",
|
239
239
|
"libc",
|
@@ -293,9 +293,9 @@ dependencies = [
|
|
293
293
|
|
294
294
|
[[package]]
|
295
295
|
name = "itoa"
|
296
|
-
version = "1.0.
|
296
|
+
version = "1.0.6"
|
297
297
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
298
|
-
checksum = "
|
298
|
+
checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
|
299
299
|
|
300
300
|
[[package]]
|
301
301
|
name = "lazy_static"
|
@@ -311,9 +311,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
|
311
311
|
|
312
312
|
[[package]]
|
313
313
|
name = "libc"
|
314
|
-
version = "0.2.
|
314
|
+
version = "0.2.141"
|
315
315
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
316
|
-
checksum = "
|
316
|
+
checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5"
|
317
317
|
|
318
318
|
[[package]]
|
319
319
|
name = "libloading"
|
@@ -352,9 +352,9 @@ checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d"
|
|
352
352
|
|
353
353
|
[[package]]
|
354
354
|
name = "magnus"
|
355
|
-
version = "0.5.
|
355
|
+
version = "0.5.3"
|
356
356
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
357
|
-
checksum = "
|
357
|
+
checksum = "c8dc14463c2552e753ef562961f486ca76f17a857c121db40e9f3ade3f35ab81"
|
358
358
|
dependencies = [
|
359
359
|
"magnus-macros",
|
360
360
|
"rb-sys",
|
@@ -363,13 +363,13 @@ dependencies = [
|
|
363
363
|
|
364
364
|
[[package]]
|
365
365
|
name = "magnus-macros"
|
366
|
-
version = "0.4.
|
366
|
+
version = "0.4.1"
|
367
367
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
368
|
-
checksum = "
|
368
|
+
checksum = "6cc17af1d45442c011aa579d727ec6cff8a69aea8a6bbad26736e7112d749bfb"
|
369
369
|
dependencies = [
|
370
370
|
"proc-macro2",
|
371
371
|
"quote",
|
372
|
-
"syn",
|
372
|
+
"syn 1.0.109",
|
373
373
|
]
|
374
374
|
|
375
375
|
[[package]]
|
@@ -380,9 +380,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
|
|
380
380
|
|
381
381
|
[[package]]
|
382
382
|
name = "memoffset"
|
383
|
-
version = "0.
|
383
|
+
version = "0.8.0"
|
384
384
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
385
|
-
checksum = "
|
385
|
+
checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
|
386
386
|
dependencies = [
|
387
387
|
"autocfg",
|
388
388
|
]
|
@@ -393,6 +393,27 @@ version = "0.2.1"
|
|
393
393
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
394
394
|
checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
395
395
|
|
396
|
+
[[package]]
|
397
|
+
name = "monostate"
|
398
|
+
version = "0.1.6"
|
399
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
400
|
+
checksum = "0230b703f1ac35df1e24f6d0d2255472bcccaf657ecdfa4f1fcbcad1ad5bb98a"
|
401
|
+
dependencies = [
|
402
|
+
"monostate-impl",
|
403
|
+
"serde",
|
404
|
+
]
|
405
|
+
|
406
|
+
[[package]]
|
407
|
+
name = "monostate-impl"
|
408
|
+
version = "0.1.6"
|
409
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
410
|
+
checksum = "8795add3e14028f11f8e848bd3294898a8294767b3776b6f733560d33bd2530b"
|
411
|
+
dependencies = [
|
412
|
+
"proc-macro2",
|
413
|
+
"quote",
|
414
|
+
"syn 2.0.13",
|
415
|
+
]
|
416
|
+
|
396
417
|
[[package]]
|
397
418
|
name = "nom"
|
398
419
|
version = "7.1.3"
|
@@ -421,9 +442,9 @@ checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a"
|
|
421
442
|
|
422
443
|
[[package]]
|
423
444
|
name = "once_cell"
|
424
|
-
version = "1.17.
|
445
|
+
version = "1.17.1"
|
425
446
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
426
|
-
checksum = "
|
447
|
+
checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
|
427
448
|
|
428
449
|
[[package]]
|
429
450
|
name = "onig"
|
@@ -449,9 +470,9 @@ dependencies = [
|
|
449
470
|
|
450
471
|
[[package]]
|
451
472
|
name = "paste"
|
452
|
-
version = "1.0.
|
473
|
+
version = "1.0.12"
|
453
474
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
454
|
-
checksum = "
|
475
|
+
checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79"
|
455
476
|
|
456
477
|
[[package]]
|
457
478
|
name = "peeking_take_while"
|
@@ -473,18 +494,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
|
|
473
494
|
|
474
495
|
[[package]]
|
475
496
|
name = "proc-macro2"
|
476
|
-
version = "1.0.
|
497
|
+
version = "1.0.56"
|
477
498
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
478
|
-
checksum = "
|
499
|
+
checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435"
|
479
500
|
dependencies = [
|
480
501
|
"unicode-ident",
|
481
502
|
]
|
482
503
|
|
483
504
|
[[package]]
|
484
505
|
name = "quote"
|
485
|
-
version = "1.0.
|
506
|
+
version = "1.0.26"
|
486
507
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
487
|
-
checksum = "
|
508
|
+
checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc"
|
488
509
|
dependencies = [
|
489
510
|
"proc-macro2",
|
490
511
|
]
|
@@ -521,9 +542,9 @@ dependencies = [
|
|
521
542
|
|
522
543
|
[[package]]
|
523
544
|
name = "rayon"
|
524
|
-
version = "1.
|
545
|
+
version = "1.7.0"
|
525
546
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
526
|
-
checksum = "
|
547
|
+
checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b"
|
527
548
|
dependencies = [
|
528
549
|
"either",
|
529
550
|
"rayon-core",
|
@@ -542,9 +563,9 @@ dependencies = [
|
|
542
563
|
|
543
564
|
[[package]]
|
544
565
|
name = "rayon-core"
|
545
|
-
version = "1.
|
566
|
+
version = "1.11.0"
|
546
567
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
547
|
-
checksum = "
|
568
|
+
checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d"
|
548
569
|
dependencies = [
|
549
570
|
"crossbeam-channel",
|
550
571
|
"crossbeam-deque",
|
@@ -554,25 +575,26 @@ dependencies = [
|
|
554
575
|
|
555
576
|
[[package]]
|
556
577
|
name = "rb-sys"
|
557
|
-
version = "0.9.
|
578
|
+
version = "0.9.71"
|
558
579
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
559
|
-
checksum = "
|
580
|
+
checksum = "156bfedced1e236600bcaad538477097ff2ed5c6b474e411d15b791e1d24c0f1"
|
560
581
|
dependencies = [
|
561
582
|
"rb-sys-build",
|
562
583
|
]
|
563
584
|
|
564
585
|
[[package]]
|
565
586
|
name = "rb-sys-build"
|
566
|
-
version = "0.9.
|
587
|
+
version = "0.9.71"
|
567
588
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
568
|
-
checksum = "
|
589
|
+
checksum = "5cb2e4a32cbc290b543a74567072ad24b708aff7bb5dde5a68d5690379cd7938"
|
569
590
|
dependencies = [
|
570
591
|
"bindgen",
|
571
592
|
"lazy_static",
|
593
|
+
"proc-macro2",
|
572
594
|
"quote",
|
573
595
|
"regex",
|
574
596
|
"shell-words",
|
575
|
-
"syn",
|
597
|
+
"syn 1.0.109",
|
576
598
|
]
|
577
599
|
|
578
600
|
[[package]]
|
@@ -583,9 +605,9 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
|
|
583
605
|
|
584
606
|
[[package]]
|
585
607
|
name = "regex"
|
586
|
-
version = "1.7.
|
608
|
+
version = "1.7.3"
|
587
609
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
588
|
-
checksum = "
|
610
|
+
checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d"
|
589
611
|
dependencies = [
|
590
612
|
"aho-corasick",
|
591
613
|
"memchr",
|
@@ -594,9 +616,9 @@ dependencies = [
|
|
594
616
|
|
595
617
|
[[package]]
|
596
618
|
name = "regex-syntax"
|
597
|
-
version = "0.6.
|
619
|
+
version = "0.6.29"
|
598
620
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
599
|
-
checksum = "
|
621
|
+
checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
|
600
622
|
|
601
623
|
[[package]]
|
602
624
|
name = "rustc-hash"
|
@@ -606,9 +628,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
|
|
606
628
|
|
607
629
|
[[package]]
|
608
630
|
name = "ryu"
|
609
|
-
version = "1.0.
|
631
|
+
version = "1.0.13"
|
610
632
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
611
|
-
checksum = "
|
633
|
+
checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
|
612
634
|
|
613
635
|
[[package]]
|
614
636
|
name = "scopeguard"
|
@@ -618,29 +640,29 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
|
|
618
640
|
|
619
641
|
[[package]]
|
620
642
|
name = "serde"
|
621
|
-
version = "1.0.
|
643
|
+
version = "1.0.159"
|
622
644
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
623
|
-
checksum = "
|
645
|
+
checksum = "3c04e8343c3daeec41f58990b9d77068df31209f2af111e059e9fe9646693065"
|
624
646
|
dependencies = [
|
625
647
|
"serde_derive",
|
626
648
|
]
|
627
649
|
|
628
650
|
[[package]]
|
629
651
|
name = "serde_derive"
|
630
|
-
version = "1.0.
|
652
|
+
version = "1.0.159"
|
631
653
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
632
|
-
checksum = "
|
654
|
+
checksum = "4c614d17805b093df4b147b51339e7e44bf05ef59fba1e45d83500bcfb4d8585"
|
633
655
|
dependencies = [
|
634
656
|
"proc-macro2",
|
635
657
|
"quote",
|
636
|
-
"syn",
|
658
|
+
"syn 2.0.13",
|
637
659
|
]
|
638
660
|
|
639
661
|
[[package]]
|
640
662
|
name = "serde_json"
|
641
|
-
version = "1.0.
|
663
|
+
version = "1.0.95"
|
642
664
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
643
|
-
checksum = "
|
665
|
+
checksum = "d721eca97ac802aa7777b701877c8004d950fc142651367300d21c1cc0194744"
|
644
666
|
dependencies = [
|
645
667
|
"itoa",
|
646
668
|
"ryu",
|
@@ -685,9 +707,20 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
|
|
685
707
|
|
686
708
|
[[package]]
|
687
709
|
name = "syn"
|
688
|
-
version = "1.0.
|
710
|
+
version = "1.0.109"
|
689
711
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
690
|
-
checksum = "
|
712
|
+
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
|
713
|
+
dependencies = [
|
714
|
+
"proc-macro2",
|
715
|
+
"quote",
|
716
|
+
"unicode-ident",
|
717
|
+
]
|
718
|
+
|
719
|
+
[[package]]
|
720
|
+
name = "syn"
|
721
|
+
version = "2.0.13"
|
722
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
723
|
+
checksum = "4c9da457c5285ac1f936ebd076af6dac17a61cfe7826f2076b4d015cf47bc8ec"
|
691
724
|
dependencies = [
|
692
725
|
"proc-macro2",
|
693
726
|
"quote",
|
@@ -696,38 +729,39 @@ dependencies = [
|
|
696
729
|
|
697
730
|
[[package]]
|
698
731
|
name = "thiserror"
|
699
|
-
version = "1.0.
|
732
|
+
version = "1.0.40"
|
700
733
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
701
|
-
checksum = "
|
734
|
+
checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
|
702
735
|
dependencies = [
|
703
736
|
"thiserror-impl",
|
704
737
|
]
|
705
738
|
|
706
739
|
[[package]]
|
707
740
|
name = "thiserror-impl"
|
708
|
-
version = "1.0.
|
741
|
+
version = "1.0.40"
|
709
742
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
710
|
-
checksum = "
|
743
|
+
checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
|
711
744
|
dependencies = [
|
712
745
|
"proc-macro2",
|
713
746
|
"quote",
|
714
|
-
"syn",
|
747
|
+
"syn 2.0.13",
|
715
748
|
]
|
716
749
|
|
717
750
|
[[package]]
|
718
751
|
name = "tokenizers"
|
719
|
-
version = "0.3.
|
752
|
+
version = "0.3.3"
|
720
753
|
dependencies = [
|
721
754
|
"magnus",
|
722
755
|
"onig",
|
723
756
|
"serde",
|
724
|
-
"tokenizers 0.13.
|
757
|
+
"tokenizers 0.13.3",
|
725
758
|
]
|
726
759
|
|
727
760
|
[[package]]
|
728
761
|
name = "tokenizers"
|
729
|
-
version = "0.13.
|
730
|
-
source = "
|
762
|
+
version = "0.13.3"
|
763
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
764
|
+
checksum = "5cf49017523bf0bc01c9966f172c5f120bbb7b96cccd1708772dd42e767fb9f5"
|
731
765
|
dependencies = [
|
732
766
|
"aho-corasick",
|
733
767
|
"derive_builder",
|
@@ -738,6 +772,7 @@ dependencies = [
|
|
738
772
|
"lazy_static",
|
739
773
|
"log",
|
740
774
|
"macro_rules_attribute",
|
775
|
+
"monostate",
|
741
776
|
"onig",
|
742
777
|
"paste",
|
743
778
|
"rand",
|
@@ -756,9 +791,9 @@ dependencies = [
|
|
756
791
|
|
757
792
|
[[package]]
|
758
793
|
name = "unicode-ident"
|
759
|
-
version = "1.0.
|
794
|
+
version = "1.0.8"
|
760
795
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
761
|
-
checksum = "
|
796
|
+
checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
|
762
797
|
|
763
798
|
[[package]]
|
764
799
|
name = "unicode-normalization-alignments"
|
@@ -832,42 +867,42 @@ dependencies = [
|
|
832
867
|
|
833
868
|
[[package]]
|
834
869
|
name = "windows_aarch64_gnullvm"
|
835
|
-
version = "0.42.
|
870
|
+
version = "0.42.2"
|
836
871
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
837
|
-
checksum = "
|
872
|
+
checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
|
838
873
|
|
839
874
|
[[package]]
|
840
875
|
name = "windows_aarch64_msvc"
|
841
|
-
version = "0.42.
|
876
|
+
version = "0.42.2"
|
842
877
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
843
|
-
checksum = "
|
878
|
+
checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
|
844
879
|
|
845
880
|
[[package]]
|
846
881
|
name = "windows_i686_gnu"
|
847
|
-
version = "0.42.
|
882
|
+
version = "0.42.2"
|
848
883
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
849
|
-
checksum = "
|
884
|
+
checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
|
850
885
|
|
851
886
|
[[package]]
|
852
887
|
name = "windows_i686_msvc"
|
853
|
-
version = "0.42.
|
888
|
+
version = "0.42.2"
|
854
889
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
855
|
-
checksum = "
|
890
|
+
checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
|
856
891
|
|
857
892
|
[[package]]
|
858
893
|
name = "windows_x86_64_gnu"
|
859
|
-
version = "0.42.
|
894
|
+
version = "0.42.2"
|
860
895
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
861
|
-
checksum = "
|
896
|
+
checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
|
862
897
|
|
863
898
|
[[package]]
|
864
899
|
name = "windows_x86_64_gnullvm"
|
865
|
-
version = "0.42.
|
900
|
+
version = "0.42.2"
|
866
901
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
867
|
-
checksum = "
|
902
|
+
checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
|
868
903
|
|
869
904
|
[[package]]
|
870
905
|
name = "windows_x86_64_msvc"
|
871
|
-
version = "0.42.
|
906
|
+
version = "0.42.2"
|
872
907
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
873
|
-
checksum = "
|
908
|
+
checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
|
data/ext/tokenizers/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "tokenizers"
|
3
|
-
version = "0.3.
|
3
|
+
version = "0.3.3"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -11,11 +11,10 @@ crate-type = ["cdylib"]
|
|
11
11
|
|
12
12
|
[dependencies]
|
13
13
|
magnus = "0.5"
|
14
|
-
onig = { version = "6
|
15
|
-
serde = { version = "1
|
14
|
+
onig = { version = "6", default-features = false }
|
15
|
+
serde = { version = "1", features = ["rc", "derive"] }
|
16
16
|
|
17
17
|
[dependencies.tokenizers]
|
18
|
-
version = "0.13.
|
19
|
-
git = "https://github.com/huggingface/tokenizers"
|
18
|
+
version = "=0.13.3" # also update in from_pretrained.rb
|
20
19
|
default-features = false
|
21
20
|
features = ["progressbar", "onig", "esaxx_fast"]
|
@@ -7,14 +7,19 @@ use magnus::{
|
|
7
7
|
};
|
8
8
|
use serde::{Deserialize, Serialize};
|
9
9
|
use tk::decoders::bpe::BPEDecoder;
|
10
|
+
use tk::decoders::byte_fallback::ByteFallback;
|
10
11
|
use tk::decoders::byte_level::ByteLevel;
|
11
12
|
use tk::decoders::ctc::CTC;
|
13
|
+
use tk::decoders::fuse::Fuse;
|
12
14
|
use tk::decoders::metaspace::Metaspace;
|
15
|
+
use tk::decoders::strip::Strip;
|
13
16
|
use tk::decoders::wordpiece::WordPiece;
|
14
17
|
use tk::decoders::DecoderWrapper;
|
15
18
|
use tk::Decoder;
|
19
|
+
use tk::normalizers::replace::Replace;
|
16
20
|
|
17
|
-
use super::
|
21
|
+
use super::utils::*;
|
22
|
+
use super::{RbError, RbResult};
|
18
23
|
|
19
24
|
#[derive(DataTypeFunctions, Clone, Deserialize, Serialize)]
|
20
25
|
pub struct RbDecoder {
|
@@ -89,6 +94,30 @@ impl RbDecoder {
|
|
89
94
|
setter!(self, CTC, word_delimiter_token, word_delimiter_token);
|
90
95
|
}
|
91
96
|
|
97
|
+
fn strip_content(&self) -> char {
|
98
|
+
getter!(self, Strip, content)
|
99
|
+
}
|
100
|
+
|
101
|
+
fn strip_set_content(&self, content: char) {
|
102
|
+
setter!(self, Strip, content, content)
|
103
|
+
}
|
104
|
+
|
105
|
+
fn strip_start(&self) -> usize {
|
106
|
+
getter!(self, Strip, start)
|
107
|
+
}
|
108
|
+
|
109
|
+
fn strip_set_start(&self, start: usize) {
|
110
|
+
setter!(self, Strip, start, start)
|
111
|
+
}
|
112
|
+
|
113
|
+
fn strip_stop(&self) -> usize {
|
114
|
+
getter!(self, Strip, stop)
|
115
|
+
}
|
116
|
+
|
117
|
+
fn strip_set_stop(&self, stop: usize) {
|
118
|
+
setter!(self, Strip, stop, stop)
|
119
|
+
}
|
120
|
+
|
92
121
|
pub fn metaspace_replacement(&self) -> char {
|
93
122
|
getter!(self, Metaspace, get_replacement().clone())
|
94
123
|
}
|
@@ -130,6 +159,14 @@ impl RbBPEDecoder {
|
|
130
159
|
}
|
131
160
|
}
|
132
161
|
|
162
|
+
pub struct RbByteFallbackDecoder {}
|
163
|
+
|
164
|
+
impl RbByteFallbackDecoder {
|
165
|
+
pub fn new() -> RbDecoder {
|
166
|
+
ByteFallback::default().into()
|
167
|
+
}
|
168
|
+
}
|
169
|
+
|
133
170
|
pub struct RbByteLevelDecoder {}
|
134
171
|
|
135
172
|
impl RbByteLevelDecoder {
|
@@ -146,6 +183,14 @@ impl RbCTC {
|
|
146
183
|
}
|
147
184
|
}
|
148
185
|
|
186
|
+
pub struct RbFuse {}
|
187
|
+
|
188
|
+
impl RbFuse {
|
189
|
+
pub fn new() -> RbDecoder {
|
190
|
+
Fuse::default().into()
|
191
|
+
}
|
192
|
+
}
|
193
|
+
|
149
194
|
pub struct RbMetaspaceDecoder {}
|
150
195
|
|
151
196
|
impl RbMetaspaceDecoder {
|
@@ -154,6 +199,22 @@ impl RbMetaspaceDecoder {
|
|
154
199
|
}
|
155
200
|
}
|
156
201
|
|
202
|
+
pub struct RbReplaceDecoder {}
|
203
|
+
|
204
|
+
impl RbReplaceDecoder {
|
205
|
+
pub fn new(pattern: RbPattern, content: String) -> RbResult<RbDecoder> {
|
206
|
+
Replace::new(pattern, content).map(|v| v.into()).map_err(RbError::from)
|
207
|
+
}
|
208
|
+
}
|
209
|
+
|
210
|
+
pub struct RbStripDecoder {}
|
211
|
+
|
212
|
+
impl RbStripDecoder {
|
213
|
+
pub fn new(content: char, start: usize, stop: usize) -> RbDecoder {
|
214
|
+
Strip::new(content, start, stop).into()
|
215
|
+
}
|
216
|
+
}
|
217
|
+
|
157
218
|
pub struct RbWordPieceDecoder {}
|
158
219
|
|
159
220
|
impl RbWordPieceDecoder {
|
@@ -219,6 +280,11 @@ unsafe impl TypedData for RbDecoder {
|
|
219
280
|
class.undef_alloc_func();
|
220
281
|
class
|
221
282
|
}),
|
283
|
+
DecoderWrapper::ByteFallback(_) => *memoize!(RClass: {
|
284
|
+
let class: RClass = crate::decoders().const_get("ByteFallback").unwrap();
|
285
|
+
class.undef_alloc_func();
|
286
|
+
class
|
287
|
+
}),
|
222
288
|
DecoderWrapper::ByteLevel(_) => *memoize!(RClass: {
|
223
289
|
let class: RClass = crate::decoders().const_get("ByteLevel").unwrap();
|
224
290
|
class.undef_alloc_func();
|
@@ -229,11 +295,26 @@ unsafe impl TypedData for RbDecoder {
|
|
229
295
|
class.undef_alloc_func();
|
230
296
|
class
|
231
297
|
}),
|
298
|
+
DecoderWrapper::Fuse(_) => *memoize!(RClass: {
|
299
|
+
let class: RClass = crate::decoders().const_get("Fuse").unwrap();
|
300
|
+
class.undef_alloc_func();
|
301
|
+
class
|
302
|
+
}),
|
232
303
|
DecoderWrapper::Metaspace(_) => *memoize!(RClass: {
|
233
304
|
let class: RClass = crate::decoders().const_get("Metaspace").unwrap();
|
234
305
|
class.undef_alloc_func();
|
235
306
|
class
|
236
307
|
}),
|
308
|
+
DecoderWrapper::Replace(_) => *memoize!(RClass: {
|
309
|
+
let class: RClass = crate::decoders().const_get("Replace").unwrap();
|
310
|
+
class.undef_alloc_func();
|
311
|
+
class
|
312
|
+
}),
|
313
|
+
DecoderWrapper::Strip(_) => *memoize!(RClass: {
|
314
|
+
let class: RClass = crate::decoders().const_get("Strip").unwrap();
|
315
|
+
class.undef_alloc_func();
|
316
|
+
class
|
317
|
+
}),
|
237
318
|
DecoderWrapper::WordPiece(_) => *memoize!(RClass: {
|
238
319
|
let class: RClass = crate::decoders().const_get("WordPiece").unwrap();
|
239
320
|
class.undef_alloc_func();
|
@@ -253,6 +334,9 @@ pub fn decoders(module: &RModule) -> RbResult<()> {
|
|
253
334
|
class.define_method("suffix", method!(RbDecoder::bpe_suffix, 0))?;
|
254
335
|
class.define_method("suffix=", method!(RbDecoder::bpe_set_suffix, 1))?;
|
255
336
|
|
337
|
+
let class = module.define_class("ByteFallback", decoder)?;
|
338
|
+
class.define_singleton_method("new", function!(RbByteFallbackDecoder::new, 0))?;
|
339
|
+
|
256
340
|
let class = module.define_class("ByteLevel", decoder)?;
|
257
341
|
class.define_singleton_method("new", function!(RbByteLevelDecoder::new, 0))?;
|
258
342
|
|
@@ -265,6 +349,9 @@ pub fn decoders(module: &RModule) -> RbResult<()> {
|
|
265
349
|
class.define_method("word_delimiter_token", method!(RbDecoder::ctc_word_delimiter_token, 0))?;
|
266
350
|
class.define_method("word_delimiter_token=", method!(RbDecoder::ctc_set_word_delimiter_token, 1))?;
|
267
351
|
|
352
|
+
let class = module.define_class("Fuse", decoder)?;
|
353
|
+
class.define_singleton_method("new", function!(RbFuse::new, 0))?;
|
354
|
+
|
268
355
|
let class = module.define_class("Metaspace", decoder)?;
|
269
356
|
class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 2))?;
|
270
357
|
class.define_method("add_prefix_space", method!(RbDecoder::metaspace_add_prefix_space, 0))?;
|
@@ -272,6 +359,18 @@ pub fn decoders(module: &RModule) -> RbResult<()> {
|
|
272
359
|
class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
|
273
360
|
class.define_method("replacement=", method!(RbDecoder::metaspace_set_replacement, 1))?;
|
274
361
|
|
362
|
+
let class = module.define_class("Replace", decoder)?;
|
363
|
+
class.define_singleton_method("new", function!(RbReplaceDecoder::new, 2))?;
|
364
|
+
|
365
|
+
let class = module.define_class("Strip", decoder)?;
|
366
|
+
class.define_singleton_method("_new", function!(RbStripDecoder::new, 3))?;
|
367
|
+
class.define_method("content", method!(RbDecoder::strip_content, 0))?;
|
368
|
+
class.define_method("content=", method!(RbDecoder::strip_set_content, 1))?;
|
369
|
+
class.define_method("start", method!(RbDecoder::strip_start, 0))?;
|
370
|
+
class.define_method("start=", method!(RbDecoder::strip_set_start, 1))?;
|
371
|
+
class.define_method("stop", method!(RbDecoder::strip_stop, 0))?;
|
372
|
+
class.define_method("stop=", method!(RbDecoder::strip_set_stop, 1))?;
|
373
|
+
|
275
374
|
let class = module.define_class("WordPiece", decoder)?;
|
276
375
|
class.define_singleton_method("_new", function!(RbWordPieceDecoder::new, 2))?;
|
277
376
|
class.define_method("cleanup", method!(RbDecoder::word_piece_cleanup, 0))?;
|
@@ -101,6 +101,11 @@ impl RbBPE {
|
|
101
101
|
builder = builder.fuse_unk(value.try_convert()?);
|
102
102
|
}
|
103
103
|
|
104
|
+
let value: Value = kwargs.delete(Symbol::new("byte_fallback"))?;
|
105
|
+
if !value.is_nil() {
|
106
|
+
builder = builder.byte_fallback(value.try_convert()?);
|
107
|
+
}
|
108
|
+
|
104
109
|
if !kwargs.is_empty() {
|
105
110
|
// TODO improve message
|
106
111
|
return Err(Error::new(exception::arg_error(), "unknown keyword"));
|
@@ -169,6 +174,14 @@ impl RbModel {
|
|
169
174
|
setter!(self, BPE, fuse_unk, fuse_unk);
|
170
175
|
}
|
171
176
|
|
177
|
+
pub fn bpe_byte_fallback(&self) -> bool {
|
178
|
+
getter!(self, BPE, byte_fallback)
|
179
|
+
}
|
180
|
+
|
181
|
+
pub fn bpe_set_byte_fallback(&self, byte_fallback: bool) {
|
182
|
+
setter!(self, BPE, byte_fallback, byte_fallback);
|
183
|
+
}
|
184
|
+
|
172
185
|
pub fn bpe_continuing_subword_prefix(&self) -> Option<String> {
|
173
186
|
getter!(self, BPE, continuing_subword_prefix.clone())
|
174
187
|
}
|
@@ -355,6 +368,8 @@ pub fn models(module: &RModule) -> RbResult<()> {
|
|
355
368
|
class.define_method("end_of_word_suffix=", method!(RbModel::bpe_set_end_of_word_suffix, 1))?;
|
356
369
|
class.define_method("fuse_unk", method!(RbModel::bpe_fuse_unk, 0))?;
|
357
370
|
class.define_method("fuse_unk=", method!(RbModel::bpe_set_fuse_unk, 1))?;
|
371
|
+
class.define_method("byte_fallback", method!(RbModel::bpe_byte_fallback, 0))?;
|
372
|
+
class.define_method("byte_fallback=", method!(RbModel::bpe_set_byte_fallback, 1))?;
|
358
373
|
|
359
374
|
let class = module.define_class("Unigram", model)?;
|
360
375
|
class.define_singleton_method("_new", function!(RbUnigram::new, 2))?;
|
@@ -8,7 +8,7 @@ use magnus::{
|
|
8
8
|
use serde::ser::SerializeStruct;
|
9
9
|
use serde::{Deserialize, Serialize, Serializer};
|
10
10
|
use tk::normalizers::{
|
11
|
-
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Replace, Strip, StripAccents,
|
11
|
+
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Replace, Prepend, Strip, StripAccents,
|
12
12
|
NFC, NFD, NFKC, NFKD,
|
13
13
|
};
|
14
14
|
use tk::{NormalizedString, Normalizer};
|
@@ -44,7 +44,7 @@ macro_rules! getter {
|
|
44
44
|
($self: ident, $variant: ident, $name: ident) => {{
|
45
45
|
if let RbNormalizerTypeWrapper::Single(ref norm) = &$self.normalizer {
|
46
46
|
let wrapper = norm.read().unwrap();
|
47
|
-
if let RbNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = *wrapper {
|
47
|
+
if let RbNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (*wrapper).clone() {
|
48
48
|
o.$name
|
49
49
|
} else {
|
50
50
|
unreachable!()
|
@@ -105,6 +105,14 @@ impl RbNormalizer {
|
|
105
105
|
setter!(self, BertNormalizer, lowercase, lowercase)
|
106
106
|
}
|
107
107
|
|
108
|
+
fn prepend_prepend(&self) -> String {
|
109
|
+
getter!(self, Prepend, prepend)
|
110
|
+
}
|
111
|
+
|
112
|
+
fn prepend_set_prepend(&self, prepend: String) {
|
113
|
+
setter!(self, Prepend, prepend, prepend)
|
114
|
+
}
|
115
|
+
|
108
116
|
fn strip_left(&self) -> bool {
|
109
117
|
getter!(self, StripNormalizer, strip_left)
|
110
118
|
}
|
@@ -186,6 +194,14 @@ impl RbReplace {
|
|
186
194
|
}
|
187
195
|
}
|
188
196
|
|
197
|
+
pub struct RbPrepend {}
|
198
|
+
|
199
|
+
impl RbPrepend {
|
200
|
+
pub fn new(prepend: String) -> RbNormalizer {
|
201
|
+
Prepend::new(prepend).into()
|
202
|
+
}
|
203
|
+
}
|
204
|
+
|
189
205
|
pub struct RbStrip {}
|
190
206
|
|
191
207
|
impl RbStrip {
|
@@ -372,6 +388,11 @@ unsafe impl TypedData for RbNormalizer {
|
|
372
388
|
class.undef_alloc_func();
|
373
389
|
class
|
374
390
|
}),
|
391
|
+
NormalizerWrapper::Prepend(_) => *memoize!(RClass: {
|
392
|
+
let class: RClass = crate::normalizers().const_get("Prepend").unwrap();
|
393
|
+
class.undef_alloc_func();
|
394
|
+
class
|
395
|
+
}),
|
375
396
|
NormalizerWrapper::StripNormalizer(_) => *memoize!(RClass: {
|
376
397
|
let class: RClass = crate::normalizers().const_get("Strip").unwrap();
|
377
398
|
class.undef_alloc_func();
|
@@ -428,6 +449,11 @@ pub fn normalizers(module: &RModule) -> RbResult<()> {
|
|
428
449
|
let class = module.define_class("Replace", normalizer)?;
|
429
450
|
class.define_singleton_method("new", function!(RbReplace::new, 2))?;
|
430
451
|
|
452
|
+
let class = module.define_class("Prepend", normalizer)?;
|
453
|
+
class.define_singleton_method("_new", function!(RbPrepend::new, 1))?;
|
454
|
+
class.define_method("prepend", method!(RbNormalizer::prepend_prepend, 0))?;
|
455
|
+
class.define_method("prepend=", method!(RbNormalizer::prepend_set_prepend, 1))?;
|
456
|
+
|
431
457
|
let class = module.define_class("Strip", normalizer)?;
|
432
458
|
class.define_singleton_method("_new", function!(RbStrip::new, 2))?;
|
433
459
|
class.define_method("left", method!(RbNormalizer::strip_left, 0))?;
|
data/lib/tokenizers/version.rb
CHANGED
data/lib/tokenizers.rb
CHANGED
@@ -9,6 +9,7 @@ end
|
|
9
9
|
require_relative "tokenizers/decoders/bpe_decoder"
|
10
10
|
require_relative "tokenizers/decoders/ctc"
|
11
11
|
require_relative "tokenizers/decoders/metaspace"
|
12
|
+
require_relative "tokenizers/decoders/strip"
|
12
13
|
require_relative "tokenizers/decoders/word_piece"
|
13
14
|
|
14
15
|
# models
|
@@ -19,6 +20,7 @@ require_relative "tokenizers/models/unigram"
|
|
19
20
|
|
20
21
|
# normalizers
|
21
22
|
require_relative "tokenizers/normalizers/bert_normalizer"
|
23
|
+
require_relative "tokenizers/normalizers/prepend"
|
22
24
|
require_relative "tokenizers/normalizers/strip"
|
23
25
|
|
24
26
|
# pre-tokenizers
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.2
|
4
|
+
version: 0.3.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-04-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -56,6 +56,7 @@ files:
|
|
56
56
|
- lib/tokenizers/decoders/bpe_decoder.rb
|
57
57
|
- lib/tokenizers/decoders/ctc.rb
|
58
58
|
- lib/tokenizers/decoders/metaspace.rb
|
59
|
+
- lib/tokenizers/decoders/strip.rb
|
59
60
|
- lib/tokenizers/decoders/word_piece.rb
|
60
61
|
- lib/tokenizers/encoding.rb
|
61
62
|
- lib/tokenizers/from_pretrained.rb
|
@@ -64,6 +65,7 @@ files:
|
|
64
65
|
- lib/tokenizers/models/word_level.rb
|
65
66
|
- lib/tokenizers/models/word_piece.rb
|
66
67
|
- lib/tokenizers/normalizers/bert_normalizer.rb
|
68
|
+
- lib/tokenizers/normalizers/prepend.rb
|
67
69
|
- lib/tokenizers/normalizers/strip.rb
|
68
70
|
- lib/tokenizers/pre_tokenizers/byte_level.rb
|
69
71
|
- lib/tokenizers/pre_tokenizers/digits.rb
|
@@ -98,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
98
100
|
- !ruby/object:Gem::Version
|
99
101
|
version: '0'
|
100
102
|
requirements: []
|
101
|
-
rubygems_version: 3.4.
|
103
|
+
rubygems_version: 3.4.10
|
102
104
|
signing_key:
|
103
105
|
specification_version: 4
|
104
106
|
summary: Fast state-of-the-art tokenizers for Ruby
|