tokenizers 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/Cargo.lock +125 -90
- data/ext/tokenizers/Cargo.toml +4 -5
- data/ext/tokenizers/src/decoders.rs +100 -1
- data/ext/tokenizers/src/models.rs +15 -0
- data/ext/tokenizers/src/normalizers.rs +28 -2
- data/lib/tokenizers/decoders/strip.rb +9 -0
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/normalizers/prepend.rb +9 -0
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +2 -0
- metadata +5 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e6e88ec5618e36e317434410c960603695806bb59dadb2252f2957d8dbf0525b
|
|
4
|
+
data.tar.gz: 33a04a4a5faada27e6e7246c16d836a4ff9f6793e89de3cfd4880e30c6c8ed0d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 88e4f2ad57fd1d66cd5fcf0d8b7ff6b1ea902258296fb02d207a446032134189e3445a104658074e94f914331c94f46cfdd09eed7c745c0483cb3b32b09e6abf
|
|
7
|
+
data.tar.gz: e8a1721ecbd36874322477077331743b0d1ba2de6f90076e07ad5456c230f76625d7f28ed6e6026c11395c6bb27701a6b8c0feedf2050387d32d9b777baa51fe
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
|
@@ -71,9 +71,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
|
|
71
71
|
|
|
72
72
|
[[package]]
|
|
73
73
|
name = "clang-sys"
|
|
74
|
-
version = "1.
|
|
74
|
+
version = "1.6.1"
|
|
75
75
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
76
|
-
checksum = "
|
|
76
|
+
checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f"
|
|
77
77
|
dependencies = [
|
|
78
78
|
"glob",
|
|
79
79
|
"libc",
|
|
@@ -95,9 +95,9 @@ dependencies = [
|
|
|
95
95
|
|
|
96
96
|
[[package]]
|
|
97
97
|
name = "crossbeam-channel"
|
|
98
|
-
version = "0.5.
|
|
98
|
+
version = "0.5.8"
|
|
99
99
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
100
|
-
checksum = "
|
|
100
|
+
checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200"
|
|
101
101
|
dependencies = [
|
|
102
102
|
"cfg-if",
|
|
103
103
|
"crossbeam-utils",
|
|
@@ -105,9 +105,9 @@ dependencies = [
|
|
|
105
105
|
|
|
106
106
|
[[package]]
|
|
107
107
|
name = "crossbeam-deque"
|
|
108
|
-
version = "0.8.
|
|
108
|
+
version = "0.8.3"
|
|
109
109
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
110
|
-
checksum = "
|
|
110
|
+
checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
|
|
111
111
|
dependencies = [
|
|
112
112
|
"cfg-if",
|
|
113
113
|
"crossbeam-epoch",
|
|
@@ -116,9 +116,9 @@ dependencies = [
|
|
|
116
116
|
|
|
117
117
|
[[package]]
|
|
118
118
|
name = "crossbeam-epoch"
|
|
119
|
-
version = "0.9.
|
|
119
|
+
version = "0.9.14"
|
|
120
120
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
121
|
-
checksum = "
|
|
121
|
+
checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
|
|
122
122
|
dependencies = [
|
|
123
123
|
"autocfg",
|
|
124
124
|
"cfg-if",
|
|
@@ -129,18 +129,18 @@ dependencies = [
|
|
|
129
129
|
|
|
130
130
|
[[package]]
|
|
131
131
|
name = "crossbeam-utils"
|
|
132
|
-
version = "0.8.
|
|
132
|
+
version = "0.8.15"
|
|
133
133
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
134
|
-
checksum = "
|
|
134
|
+
checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b"
|
|
135
135
|
dependencies = [
|
|
136
136
|
"cfg-if",
|
|
137
137
|
]
|
|
138
138
|
|
|
139
139
|
[[package]]
|
|
140
140
|
name = "darling"
|
|
141
|
-
version = "0.14.
|
|
141
|
+
version = "0.14.4"
|
|
142
142
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
143
|
-
checksum = "
|
|
143
|
+
checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850"
|
|
144
144
|
dependencies = [
|
|
145
145
|
"darling_core",
|
|
146
146
|
"darling_macro",
|
|
@@ -148,27 +148,27 @@ dependencies = [
|
|
|
148
148
|
|
|
149
149
|
[[package]]
|
|
150
150
|
name = "darling_core"
|
|
151
|
-
version = "0.14.
|
|
151
|
+
version = "0.14.4"
|
|
152
152
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
153
|
-
checksum = "
|
|
153
|
+
checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0"
|
|
154
154
|
dependencies = [
|
|
155
155
|
"fnv",
|
|
156
156
|
"ident_case",
|
|
157
157
|
"proc-macro2",
|
|
158
158
|
"quote",
|
|
159
159
|
"strsim",
|
|
160
|
-
"syn",
|
|
160
|
+
"syn 1.0.109",
|
|
161
161
|
]
|
|
162
162
|
|
|
163
163
|
[[package]]
|
|
164
164
|
name = "darling_macro"
|
|
165
|
-
version = "0.14.
|
|
165
|
+
version = "0.14.4"
|
|
166
166
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
167
|
-
checksum = "
|
|
167
|
+
checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e"
|
|
168
168
|
dependencies = [
|
|
169
169
|
"darling_core",
|
|
170
170
|
"quote",
|
|
171
|
-
"syn",
|
|
171
|
+
"syn 1.0.109",
|
|
172
172
|
]
|
|
173
173
|
|
|
174
174
|
[[package]]
|
|
@@ -189,7 +189,7 @@ dependencies = [
|
|
|
189
189
|
"darling",
|
|
190
190
|
"proc-macro2",
|
|
191
191
|
"quote",
|
|
192
|
-
"syn",
|
|
192
|
+
"syn 1.0.109",
|
|
193
193
|
]
|
|
194
194
|
|
|
195
195
|
[[package]]
|
|
@@ -199,7 +199,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
199
199
|
checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e"
|
|
200
200
|
dependencies = [
|
|
201
201
|
"derive_builder_core",
|
|
202
|
-
"syn",
|
|
202
|
+
"syn 1.0.109",
|
|
203
203
|
]
|
|
204
204
|
|
|
205
205
|
[[package]]
|
|
@@ -231,9 +231,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
|
|
231
231
|
|
|
232
232
|
[[package]]
|
|
233
233
|
name = "getrandom"
|
|
234
|
-
version = "0.2.
|
|
234
|
+
version = "0.2.9"
|
|
235
235
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
236
|
-
checksum = "
|
|
236
|
+
checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4"
|
|
237
237
|
dependencies = [
|
|
238
238
|
"cfg-if",
|
|
239
239
|
"libc",
|
|
@@ -293,9 +293,9 @@ dependencies = [
|
|
|
293
293
|
|
|
294
294
|
[[package]]
|
|
295
295
|
name = "itoa"
|
|
296
|
-
version = "1.0.
|
|
296
|
+
version = "1.0.6"
|
|
297
297
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
298
|
-
checksum = "
|
|
298
|
+
checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
|
|
299
299
|
|
|
300
300
|
[[package]]
|
|
301
301
|
name = "lazy_static"
|
|
@@ -311,9 +311,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
|
|
311
311
|
|
|
312
312
|
[[package]]
|
|
313
313
|
name = "libc"
|
|
314
|
-
version = "0.2.
|
|
314
|
+
version = "0.2.141"
|
|
315
315
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
316
|
-
checksum = "
|
|
316
|
+
checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5"
|
|
317
317
|
|
|
318
318
|
[[package]]
|
|
319
319
|
name = "libloading"
|
|
@@ -352,9 +352,9 @@ checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d"
|
|
|
352
352
|
|
|
353
353
|
[[package]]
|
|
354
354
|
name = "magnus"
|
|
355
|
-
version = "0.5.
|
|
355
|
+
version = "0.5.3"
|
|
356
356
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
357
|
-
checksum = "
|
|
357
|
+
checksum = "c8dc14463c2552e753ef562961f486ca76f17a857c121db40e9f3ade3f35ab81"
|
|
358
358
|
dependencies = [
|
|
359
359
|
"magnus-macros",
|
|
360
360
|
"rb-sys",
|
|
@@ -363,13 +363,13 @@ dependencies = [
|
|
|
363
363
|
|
|
364
364
|
[[package]]
|
|
365
365
|
name = "magnus-macros"
|
|
366
|
-
version = "0.4.
|
|
366
|
+
version = "0.4.1"
|
|
367
367
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
368
|
-
checksum = "
|
|
368
|
+
checksum = "6cc17af1d45442c011aa579d727ec6cff8a69aea8a6bbad26736e7112d749bfb"
|
|
369
369
|
dependencies = [
|
|
370
370
|
"proc-macro2",
|
|
371
371
|
"quote",
|
|
372
|
-
"syn",
|
|
372
|
+
"syn 1.0.109",
|
|
373
373
|
]
|
|
374
374
|
|
|
375
375
|
[[package]]
|
|
@@ -380,9 +380,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
|
|
|
380
380
|
|
|
381
381
|
[[package]]
|
|
382
382
|
name = "memoffset"
|
|
383
|
-
version = "0.
|
|
383
|
+
version = "0.8.0"
|
|
384
384
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
385
|
-
checksum = "
|
|
385
|
+
checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
|
|
386
386
|
dependencies = [
|
|
387
387
|
"autocfg",
|
|
388
388
|
]
|
|
@@ -393,6 +393,27 @@ version = "0.2.1"
|
|
|
393
393
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
394
394
|
checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
|
395
395
|
|
|
396
|
+
[[package]]
|
|
397
|
+
name = "monostate"
|
|
398
|
+
version = "0.1.6"
|
|
399
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
400
|
+
checksum = "0230b703f1ac35df1e24f6d0d2255472bcccaf657ecdfa4f1fcbcad1ad5bb98a"
|
|
401
|
+
dependencies = [
|
|
402
|
+
"monostate-impl",
|
|
403
|
+
"serde",
|
|
404
|
+
]
|
|
405
|
+
|
|
406
|
+
[[package]]
|
|
407
|
+
name = "monostate-impl"
|
|
408
|
+
version = "0.1.6"
|
|
409
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
410
|
+
checksum = "8795add3e14028f11f8e848bd3294898a8294767b3776b6f733560d33bd2530b"
|
|
411
|
+
dependencies = [
|
|
412
|
+
"proc-macro2",
|
|
413
|
+
"quote",
|
|
414
|
+
"syn 2.0.13",
|
|
415
|
+
]
|
|
416
|
+
|
|
396
417
|
[[package]]
|
|
397
418
|
name = "nom"
|
|
398
419
|
version = "7.1.3"
|
|
@@ -421,9 +442,9 @@ checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a"
|
|
|
421
442
|
|
|
422
443
|
[[package]]
|
|
423
444
|
name = "once_cell"
|
|
424
|
-
version = "1.17.
|
|
445
|
+
version = "1.17.1"
|
|
425
446
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
426
|
-
checksum = "
|
|
447
|
+
checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
|
|
427
448
|
|
|
428
449
|
[[package]]
|
|
429
450
|
name = "onig"
|
|
@@ -449,9 +470,9 @@ dependencies = [
|
|
|
449
470
|
|
|
450
471
|
[[package]]
|
|
451
472
|
name = "paste"
|
|
452
|
-
version = "1.0.
|
|
473
|
+
version = "1.0.12"
|
|
453
474
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
454
|
-
checksum = "
|
|
475
|
+
checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79"
|
|
455
476
|
|
|
456
477
|
[[package]]
|
|
457
478
|
name = "peeking_take_while"
|
|
@@ -473,18 +494,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
|
|
|
473
494
|
|
|
474
495
|
[[package]]
|
|
475
496
|
name = "proc-macro2"
|
|
476
|
-
version = "1.0.
|
|
497
|
+
version = "1.0.56"
|
|
477
498
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
478
|
-
checksum = "
|
|
499
|
+
checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435"
|
|
479
500
|
dependencies = [
|
|
480
501
|
"unicode-ident",
|
|
481
502
|
]
|
|
482
503
|
|
|
483
504
|
[[package]]
|
|
484
505
|
name = "quote"
|
|
485
|
-
version = "1.0.
|
|
506
|
+
version = "1.0.26"
|
|
486
507
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
487
|
-
checksum = "
|
|
508
|
+
checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc"
|
|
488
509
|
dependencies = [
|
|
489
510
|
"proc-macro2",
|
|
490
511
|
]
|
|
@@ -521,9 +542,9 @@ dependencies = [
|
|
|
521
542
|
|
|
522
543
|
[[package]]
|
|
523
544
|
name = "rayon"
|
|
524
|
-
version = "1.
|
|
545
|
+
version = "1.7.0"
|
|
525
546
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
526
|
-
checksum = "
|
|
547
|
+
checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b"
|
|
527
548
|
dependencies = [
|
|
528
549
|
"either",
|
|
529
550
|
"rayon-core",
|
|
@@ -542,9 +563,9 @@ dependencies = [
|
|
|
542
563
|
|
|
543
564
|
[[package]]
|
|
544
565
|
name = "rayon-core"
|
|
545
|
-
version = "1.
|
|
566
|
+
version = "1.11.0"
|
|
546
567
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
547
|
-
checksum = "
|
|
568
|
+
checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d"
|
|
548
569
|
dependencies = [
|
|
549
570
|
"crossbeam-channel",
|
|
550
571
|
"crossbeam-deque",
|
|
@@ -554,25 +575,26 @@ dependencies = [
|
|
|
554
575
|
|
|
555
576
|
[[package]]
|
|
556
577
|
name = "rb-sys"
|
|
557
|
-
version = "0.9.
|
|
578
|
+
version = "0.9.71"
|
|
558
579
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
559
|
-
checksum = "
|
|
580
|
+
checksum = "156bfedced1e236600bcaad538477097ff2ed5c6b474e411d15b791e1d24c0f1"
|
|
560
581
|
dependencies = [
|
|
561
582
|
"rb-sys-build",
|
|
562
583
|
]
|
|
563
584
|
|
|
564
585
|
[[package]]
|
|
565
586
|
name = "rb-sys-build"
|
|
566
|
-
version = "0.9.
|
|
587
|
+
version = "0.9.71"
|
|
567
588
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
568
|
-
checksum = "
|
|
589
|
+
checksum = "5cb2e4a32cbc290b543a74567072ad24b708aff7bb5dde5a68d5690379cd7938"
|
|
569
590
|
dependencies = [
|
|
570
591
|
"bindgen",
|
|
571
592
|
"lazy_static",
|
|
593
|
+
"proc-macro2",
|
|
572
594
|
"quote",
|
|
573
595
|
"regex",
|
|
574
596
|
"shell-words",
|
|
575
|
-
"syn",
|
|
597
|
+
"syn 1.0.109",
|
|
576
598
|
]
|
|
577
599
|
|
|
578
600
|
[[package]]
|
|
@@ -583,9 +605,9 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
|
|
|
583
605
|
|
|
584
606
|
[[package]]
|
|
585
607
|
name = "regex"
|
|
586
|
-
version = "1.7.
|
|
608
|
+
version = "1.7.3"
|
|
587
609
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
588
|
-
checksum = "
|
|
610
|
+
checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d"
|
|
589
611
|
dependencies = [
|
|
590
612
|
"aho-corasick",
|
|
591
613
|
"memchr",
|
|
@@ -594,9 +616,9 @@ dependencies = [
|
|
|
594
616
|
|
|
595
617
|
[[package]]
|
|
596
618
|
name = "regex-syntax"
|
|
597
|
-
version = "0.6.
|
|
619
|
+
version = "0.6.29"
|
|
598
620
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
599
|
-
checksum = "
|
|
621
|
+
checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
|
|
600
622
|
|
|
601
623
|
[[package]]
|
|
602
624
|
name = "rustc-hash"
|
|
@@ -606,9 +628,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
|
|
|
606
628
|
|
|
607
629
|
[[package]]
|
|
608
630
|
name = "ryu"
|
|
609
|
-
version = "1.0.
|
|
631
|
+
version = "1.0.13"
|
|
610
632
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
611
|
-
checksum = "
|
|
633
|
+
checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
|
|
612
634
|
|
|
613
635
|
[[package]]
|
|
614
636
|
name = "scopeguard"
|
|
@@ -618,29 +640,29 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
|
|
|
618
640
|
|
|
619
641
|
[[package]]
|
|
620
642
|
name = "serde"
|
|
621
|
-
version = "1.0.
|
|
643
|
+
version = "1.0.159"
|
|
622
644
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
623
|
-
checksum = "
|
|
645
|
+
checksum = "3c04e8343c3daeec41f58990b9d77068df31209f2af111e059e9fe9646693065"
|
|
624
646
|
dependencies = [
|
|
625
647
|
"serde_derive",
|
|
626
648
|
]
|
|
627
649
|
|
|
628
650
|
[[package]]
|
|
629
651
|
name = "serde_derive"
|
|
630
|
-
version = "1.0.
|
|
652
|
+
version = "1.0.159"
|
|
631
653
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
632
|
-
checksum = "
|
|
654
|
+
checksum = "4c614d17805b093df4b147b51339e7e44bf05ef59fba1e45d83500bcfb4d8585"
|
|
633
655
|
dependencies = [
|
|
634
656
|
"proc-macro2",
|
|
635
657
|
"quote",
|
|
636
|
-
"syn",
|
|
658
|
+
"syn 2.0.13",
|
|
637
659
|
]
|
|
638
660
|
|
|
639
661
|
[[package]]
|
|
640
662
|
name = "serde_json"
|
|
641
|
-
version = "1.0.
|
|
663
|
+
version = "1.0.95"
|
|
642
664
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
643
|
-
checksum = "
|
|
665
|
+
checksum = "d721eca97ac802aa7777b701877c8004d950fc142651367300d21c1cc0194744"
|
|
644
666
|
dependencies = [
|
|
645
667
|
"itoa",
|
|
646
668
|
"ryu",
|
|
@@ -685,9 +707,20 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
|
|
|
685
707
|
|
|
686
708
|
[[package]]
|
|
687
709
|
name = "syn"
|
|
688
|
-
version = "1.0.
|
|
710
|
+
version = "1.0.109"
|
|
689
711
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
690
|
-
checksum = "
|
|
712
|
+
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
|
|
713
|
+
dependencies = [
|
|
714
|
+
"proc-macro2",
|
|
715
|
+
"quote",
|
|
716
|
+
"unicode-ident",
|
|
717
|
+
]
|
|
718
|
+
|
|
719
|
+
[[package]]
|
|
720
|
+
name = "syn"
|
|
721
|
+
version = "2.0.13"
|
|
722
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
723
|
+
checksum = "4c9da457c5285ac1f936ebd076af6dac17a61cfe7826f2076b4d015cf47bc8ec"
|
|
691
724
|
dependencies = [
|
|
692
725
|
"proc-macro2",
|
|
693
726
|
"quote",
|
|
@@ -696,38 +729,39 @@ dependencies = [
|
|
|
696
729
|
|
|
697
730
|
[[package]]
|
|
698
731
|
name = "thiserror"
|
|
699
|
-
version = "1.0.
|
|
732
|
+
version = "1.0.40"
|
|
700
733
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
701
|
-
checksum = "
|
|
734
|
+
checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
|
|
702
735
|
dependencies = [
|
|
703
736
|
"thiserror-impl",
|
|
704
737
|
]
|
|
705
738
|
|
|
706
739
|
[[package]]
|
|
707
740
|
name = "thiserror-impl"
|
|
708
|
-
version = "1.0.
|
|
741
|
+
version = "1.0.40"
|
|
709
742
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
710
|
-
checksum = "
|
|
743
|
+
checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
|
|
711
744
|
dependencies = [
|
|
712
745
|
"proc-macro2",
|
|
713
746
|
"quote",
|
|
714
|
-
"syn",
|
|
747
|
+
"syn 2.0.13",
|
|
715
748
|
]
|
|
716
749
|
|
|
717
750
|
[[package]]
|
|
718
751
|
name = "tokenizers"
|
|
719
|
-
version = "0.3.2"
|
|
752
|
+
version = "0.3.3"
|
|
720
753
|
dependencies = [
|
|
721
754
|
"magnus",
|
|
722
755
|
"onig",
|
|
723
756
|
"serde",
|
|
724
|
-
"tokenizers 0.13.
|
|
757
|
+
"tokenizers 0.13.3",
|
|
725
758
|
]
|
|
726
759
|
|
|
727
760
|
[[package]]
|
|
728
761
|
name = "tokenizers"
|
|
729
|
-
version = "0.13.
|
|
730
|
-
source = "
|
|
762
|
+
version = "0.13.3"
|
|
763
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
764
|
+
checksum = "5cf49017523bf0bc01c9966f172c5f120bbb7b96cccd1708772dd42e767fb9f5"
|
|
731
765
|
dependencies = [
|
|
732
766
|
"aho-corasick",
|
|
733
767
|
"derive_builder",
|
|
@@ -738,6 +772,7 @@ dependencies = [
|
|
|
738
772
|
"lazy_static",
|
|
739
773
|
"log",
|
|
740
774
|
"macro_rules_attribute",
|
|
775
|
+
"monostate",
|
|
741
776
|
"onig",
|
|
742
777
|
"paste",
|
|
743
778
|
"rand",
|
|
@@ -756,9 +791,9 @@ dependencies = [
|
|
|
756
791
|
|
|
757
792
|
[[package]]
|
|
758
793
|
name = "unicode-ident"
|
|
759
|
-
version = "1.0.
|
|
794
|
+
version = "1.0.8"
|
|
760
795
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
761
|
-
checksum = "
|
|
796
|
+
checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
|
|
762
797
|
|
|
763
798
|
[[package]]
|
|
764
799
|
name = "unicode-normalization-alignments"
|
|
@@ -832,42 +867,42 @@ dependencies = [
|
|
|
832
867
|
|
|
833
868
|
[[package]]
|
|
834
869
|
name = "windows_aarch64_gnullvm"
|
|
835
|
-
version = "0.42.
|
|
870
|
+
version = "0.42.2"
|
|
836
871
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
837
|
-
checksum = "
|
|
872
|
+
checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
|
|
838
873
|
|
|
839
874
|
[[package]]
|
|
840
875
|
name = "windows_aarch64_msvc"
|
|
841
|
-
version = "0.42.
|
|
876
|
+
version = "0.42.2"
|
|
842
877
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
843
|
-
checksum = "
|
|
878
|
+
checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
|
|
844
879
|
|
|
845
880
|
[[package]]
|
|
846
881
|
name = "windows_i686_gnu"
|
|
847
|
-
version = "0.42.
|
|
882
|
+
version = "0.42.2"
|
|
848
883
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
849
|
-
checksum = "
|
|
884
|
+
checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
|
|
850
885
|
|
|
851
886
|
[[package]]
|
|
852
887
|
name = "windows_i686_msvc"
|
|
853
|
-
version = "0.42.
|
|
888
|
+
version = "0.42.2"
|
|
854
889
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
855
|
-
checksum = "
|
|
890
|
+
checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
|
|
856
891
|
|
|
857
892
|
[[package]]
|
|
858
893
|
name = "windows_x86_64_gnu"
|
|
859
|
-
version = "0.42.
|
|
894
|
+
version = "0.42.2"
|
|
860
895
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
861
|
-
checksum = "
|
|
896
|
+
checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
|
|
862
897
|
|
|
863
898
|
[[package]]
|
|
864
899
|
name = "windows_x86_64_gnullvm"
|
|
865
|
-
version = "0.42.
|
|
900
|
+
version = "0.42.2"
|
|
866
901
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
867
|
-
checksum = "
|
|
902
|
+
checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
|
|
868
903
|
|
|
869
904
|
[[package]]
|
|
870
905
|
name = "windows_x86_64_msvc"
|
|
871
|
-
version = "0.42.
|
|
906
|
+
version = "0.42.2"
|
|
872
907
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
873
|
-
checksum = "
|
|
908
|
+
checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
|
data/ext/tokenizers/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "tokenizers"
|
|
3
|
-
version = "0.3.2"
|
|
3
|
+
version = "0.3.3"
|
|
4
4
|
license = "Apache-2.0"
|
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
|
6
6
|
edition = "2021"
|
|
@@ -11,11 +11,10 @@ crate-type = ["cdylib"]
|
|
|
11
11
|
|
|
12
12
|
[dependencies]
|
|
13
13
|
magnus = "0.5"
|
|
14
|
-
onig = { version = "6
|
|
15
|
-
serde = { version = "1
|
|
14
|
+
onig = { version = "6", default-features = false }
|
|
15
|
+
serde = { version = "1", features = ["rc", "derive"] }
|
|
16
16
|
|
|
17
17
|
[dependencies.tokenizers]
|
|
18
|
-
version = "0.13.
|
|
19
|
-
git = "https://github.com/huggingface/tokenizers"
|
|
18
|
+
version = "=0.13.3" # also update in from_pretrained.rb
|
|
20
19
|
default-features = false
|
|
21
20
|
features = ["progressbar", "onig", "esaxx_fast"]
|
data/ext/tokenizers/src/decoders.rs
CHANGED
|
@@ -7,14 +7,19 @@ use magnus::{
|
|
|
7
7
|
};
|
|
8
8
|
use serde::{Deserialize, Serialize};
|
|
9
9
|
use tk::decoders::bpe::BPEDecoder;
|
|
10
|
+
use tk::decoders::byte_fallback::ByteFallback;
|
|
10
11
|
use tk::decoders::byte_level::ByteLevel;
|
|
11
12
|
use tk::decoders::ctc::CTC;
|
|
13
|
+
use tk::decoders::fuse::Fuse;
|
|
12
14
|
use tk::decoders::metaspace::Metaspace;
|
|
15
|
+
use tk::decoders::strip::Strip;
|
|
13
16
|
use tk::decoders::wordpiece::WordPiece;
|
|
14
17
|
use tk::decoders::DecoderWrapper;
|
|
15
18
|
use tk::Decoder;
|
|
19
|
+
use tk::normalizers::replace::Replace;
|
|
16
20
|
|
|
17
|
-
use super::
|
|
21
|
+
use super::utils::*;
|
|
22
|
+
use super::{RbError, RbResult};
|
|
18
23
|
|
|
19
24
|
#[derive(DataTypeFunctions, Clone, Deserialize, Serialize)]
|
|
20
25
|
pub struct RbDecoder {
|
|
@@ -89,6 +94,30 @@ impl RbDecoder {
|
|
|
89
94
|
setter!(self, CTC, word_delimiter_token, word_delimiter_token);
|
|
90
95
|
}
|
|
91
96
|
|
|
97
|
+
fn strip_content(&self) -> char {
|
|
98
|
+
getter!(self, Strip, content)
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
fn strip_set_content(&self, content: char) {
|
|
102
|
+
setter!(self, Strip, content, content)
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
fn strip_start(&self) -> usize {
|
|
106
|
+
getter!(self, Strip, start)
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
fn strip_set_start(&self, start: usize) {
|
|
110
|
+
setter!(self, Strip, start, start)
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
fn strip_stop(&self) -> usize {
|
|
114
|
+
getter!(self, Strip, stop)
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
fn strip_set_stop(&self, stop: usize) {
|
|
118
|
+
setter!(self, Strip, stop, stop)
|
|
119
|
+
}
|
|
120
|
+
|
|
92
121
|
pub fn metaspace_replacement(&self) -> char {
|
|
93
122
|
getter!(self, Metaspace, get_replacement().clone())
|
|
94
123
|
}
|
|
@@ -130,6 +159,14 @@ impl RbBPEDecoder {
|
|
|
130
159
|
}
|
|
131
160
|
}
|
|
132
161
|
|
|
162
|
+
pub struct RbByteFallbackDecoder {}
|
|
163
|
+
|
|
164
|
+
impl RbByteFallbackDecoder {
|
|
165
|
+
pub fn new() -> RbDecoder {
|
|
166
|
+
ByteFallback::default().into()
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
|
|
133
170
|
pub struct RbByteLevelDecoder {}
|
|
134
171
|
|
|
135
172
|
impl RbByteLevelDecoder {
|
|
@@ -146,6 +183,14 @@ impl RbCTC {
|
|
|
146
183
|
}
|
|
147
184
|
}
|
|
148
185
|
|
|
186
|
+
pub struct RbFuse {}
|
|
187
|
+
|
|
188
|
+
impl RbFuse {
|
|
189
|
+
pub fn new() -> RbDecoder {
|
|
190
|
+
Fuse::default().into()
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
|
|
149
194
|
pub struct RbMetaspaceDecoder {}
|
|
150
195
|
|
|
151
196
|
impl RbMetaspaceDecoder {
|
|
@@ -154,6 +199,22 @@ impl RbMetaspaceDecoder {
|
|
|
154
199
|
}
|
|
155
200
|
}
|
|
156
201
|
|
|
202
|
+
pub struct RbReplaceDecoder {}
|
|
203
|
+
|
|
204
|
+
impl RbReplaceDecoder {
|
|
205
|
+
pub fn new(pattern: RbPattern, content: String) -> RbResult<RbDecoder> {
|
|
206
|
+
Replace::new(pattern, content).map(|v| v.into()).map_err(RbError::from)
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
pub struct RbStripDecoder {}
|
|
211
|
+
|
|
212
|
+
impl RbStripDecoder {
|
|
213
|
+
pub fn new(content: char, start: usize, stop: usize) -> RbDecoder {
|
|
214
|
+
Strip::new(content, start, stop).into()
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
|
|
157
218
|
pub struct RbWordPieceDecoder {}
|
|
158
219
|
|
|
159
220
|
impl RbWordPieceDecoder {
|
|
@@ -219,6 +280,11 @@ unsafe impl TypedData for RbDecoder {
|
|
|
219
280
|
class.undef_alloc_func();
|
|
220
281
|
class
|
|
221
282
|
}),
|
|
283
|
+
DecoderWrapper::ByteFallback(_) => *memoize!(RClass: {
|
|
284
|
+
let class: RClass = crate::decoders().const_get("ByteFallback").unwrap();
|
|
285
|
+
class.undef_alloc_func();
|
|
286
|
+
class
|
|
287
|
+
}),
|
|
222
288
|
DecoderWrapper::ByteLevel(_) => *memoize!(RClass: {
|
|
223
289
|
let class: RClass = crate::decoders().const_get("ByteLevel").unwrap();
|
|
224
290
|
class.undef_alloc_func();
|
|
@@ -229,11 +295,26 @@ unsafe impl TypedData for RbDecoder {
|
|
|
229
295
|
class.undef_alloc_func();
|
|
230
296
|
class
|
|
231
297
|
}),
|
|
298
|
+
DecoderWrapper::Fuse(_) => *memoize!(RClass: {
|
|
299
|
+
let class: RClass = crate::decoders().const_get("Fuse").unwrap();
|
|
300
|
+
class.undef_alloc_func();
|
|
301
|
+
class
|
|
302
|
+
}),
|
|
232
303
|
DecoderWrapper::Metaspace(_) => *memoize!(RClass: {
|
|
233
304
|
let class: RClass = crate::decoders().const_get("Metaspace").unwrap();
|
|
234
305
|
class.undef_alloc_func();
|
|
235
306
|
class
|
|
236
307
|
}),
|
|
308
|
+
DecoderWrapper::Replace(_) => *memoize!(RClass: {
|
|
309
|
+
let class: RClass = crate::decoders().const_get("Replace").unwrap();
|
|
310
|
+
class.undef_alloc_func();
|
|
311
|
+
class
|
|
312
|
+
}),
|
|
313
|
+
DecoderWrapper::Strip(_) => *memoize!(RClass: {
|
|
314
|
+
let class: RClass = crate::decoders().const_get("Strip").unwrap();
|
|
315
|
+
class.undef_alloc_func();
|
|
316
|
+
class
|
|
317
|
+
}),
|
|
237
318
|
DecoderWrapper::WordPiece(_) => *memoize!(RClass: {
|
|
238
319
|
let class: RClass = crate::decoders().const_get("WordPiece").unwrap();
|
|
239
320
|
class.undef_alloc_func();
|
|
@@ -253,6 +334,9 @@ pub fn decoders(module: &RModule) -> RbResult<()> {
|
|
|
253
334
|
class.define_method("suffix", method!(RbDecoder::bpe_suffix, 0))?;
|
|
254
335
|
class.define_method("suffix=", method!(RbDecoder::bpe_set_suffix, 1))?;
|
|
255
336
|
|
|
337
|
+
let class = module.define_class("ByteFallback", decoder)?;
|
|
338
|
+
class.define_singleton_method("new", function!(RbByteFallbackDecoder::new, 0))?;
|
|
339
|
+
|
|
256
340
|
let class = module.define_class("ByteLevel", decoder)?;
|
|
257
341
|
class.define_singleton_method("new", function!(RbByteLevelDecoder::new, 0))?;
|
|
258
342
|
|
|
@@ -265,6 +349,9 @@ pub fn decoders(module: &RModule) -> RbResult<()> {
|
|
|
265
349
|
class.define_method("word_delimiter_token", method!(RbDecoder::ctc_word_delimiter_token, 0))?;
|
|
266
350
|
class.define_method("word_delimiter_token=", method!(RbDecoder::ctc_set_word_delimiter_token, 1))?;
|
|
267
351
|
|
|
352
|
+
let class = module.define_class("Fuse", decoder)?;
|
|
353
|
+
class.define_singleton_method("new", function!(RbFuse::new, 0))?;
|
|
354
|
+
|
|
268
355
|
let class = module.define_class("Metaspace", decoder)?;
|
|
269
356
|
class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 2))?;
|
|
270
357
|
class.define_method("add_prefix_space", method!(RbDecoder::metaspace_add_prefix_space, 0))?;
|
|
@@ -272,6 +359,18 @@ pub fn decoders(module: &RModule) -> RbResult<()> {
|
|
|
272
359
|
class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
|
|
273
360
|
class.define_method("replacement=", method!(RbDecoder::metaspace_set_replacement, 1))?;
|
|
274
361
|
|
|
362
|
+
let class = module.define_class("Replace", decoder)?;
|
|
363
|
+
class.define_singleton_method("new", function!(RbReplaceDecoder::new, 2))?;
|
|
364
|
+
|
|
365
|
+
let class = module.define_class("Strip", decoder)?;
|
|
366
|
+
class.define_singleton_method("_new", function!(RbStripDecoder::new, 3))?;
|
|
367
|
+
class.define_method("content", method!(RbDecoder::strip_content, 0))?;
|
|
368
|
+
class.define_method("content=", method!(RbDecoder::strip_set_content, 1))?;
|
|
369
|
+
class.define_method("start", method!(RbDecoder::strip_start, 0))?;
|
|
370
|
+
class.define_method("start=", method!(RbDecoder::strip_set_start, 1))?;
|
|
371
|
+
class.define_method("stop", method!(RbDecoder::strip_stop, 0))?;
|
|
372
|
+
class.define_method("stop=", method!(RbDecoder::strip_set_stop, 1))?;
|
|
373
|
+
|
|
275
374
|
let class = module.define_class("WordPiece", decoder)?;
|
|
276
375
|
class.define_singleton_method("_new", function!(RbWordPieceDecoder::new, 2))?;
|
|
277
376
|
class.define_method("cleanup", method!(RbDecoder::word_piece_cleanup, 0))?;
|
data/ext/tokenizers/src/models.rs
CHANGED
|
@@ -101,6 +101,11 @@ impl RbBPE {
|
|
|
101
101
|
builder = builder.fuse_unk(value.try_convert()?);
|
|
102
102
|
}
|
|
103
103
|
|
|
104
|
+
let value: Value = kwargs.delete(Symbol::new("byte_fallback"))?;
|
|
105
|
+
if !value.is_nil() {
|
|
106
|
+
builder = builder.byte_fallback(value.try_convert()?);
|
|
107
|
+
}
|
|
108
|
+
|
|
104
109
|
if !kwargs.is_empty() {
|
|
105
110
|
// TODO improve message
|
|
106
111
|
return Err(Error::new(exception::arg_error(), "unknown keyword"));
|
|
@@ -169,6 +174,14 @@ impl RbModel {
|
|
|
169
174
|
setter!(self, BPE, fuse_unk, fuse_unk);
|
|
170
175
|
}
|
|
171
176
|
|
|
177
|
+
pub fn bpe_byte_fallback(&self) -> bool {
|
|
178
|
+
getter!(self, BPE, byte_fallback)
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
pub fn bpe_set_byte_fallback(&self, byte_fallback: bool) {
|
|
182
|
+
setter!(self, BPE, byte_fallback, byte_fallback);
|
|
183
|
+
}
|
|
184
|
+
|
|
172
185
|
pub fn bpe_continuing_subword_prefix(&self) -> Option<String> {
|
|
173
186
|
getter!(self, BPE, continuing_subword_prefix.clone())
|
|
174
187
|
}
|
|
@@ -355,6 +368,8 @@ pub fn models(module: &RModule) -> RbResult<()> {
|
|
|
355
368
|
class.define_method("end_of_word_suffix=", method!(RbModel::bpe_set_end_of_word_suffix, 1))?;
|
|
356
369
|
class.define_method("fuse_unk", method!(RbModel::bpe_fuse_unk, 0))?;
|
|
357
370
|
class.define_method("fuse_unk=", method!(RbModel::bpe_set_fuse_unk, 1))?;
|
|
371
|
+
class.define_method("byte_fallback", method!(RbModel::bpe_byte_fallback, 0))?;
|
|
372
|
+
class.define_method("byte_fallback=", method!(RbModel::bpe_set_byte_fallback, 1))?;
|
|
358
373
|
|
|
359
374
|
let class = module.define_class("Unigram", model)?;
|
|
360
375
|
class.define_singleton_method("_new", function!(RbUnigram::new, 2))?;
|
|
@@ -8,7 +8,7 @@ use magnus::{
|
|
|
8
8
|
use serde::ser::SerializeStruct;
|
|
9
9
|
use serde::{Deserialize, Serialize, Serializer};
|
|
10
10
|
use tk::normalizers::{
|
|
11
|
-
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Replace, Strip, StripAccents,
|
|
11
|
+
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Replace, Prepend, Strip, StripAccents,
|
|
12
12
|
NFC, NFD, NFKC, NFKD,
|
|
13
13
|
};
|
|
14
14
|
use tk::{NormalizedString, Normalizer};
|
|
@@ -44,7 +44,7 @@ macro_rules! getter {
|
|
|
44
44
|
($self: ident, $variant: ident, $name: ident) => {{
|
|
45
45
|
if let RbNormalizerTypeWrapper::Single(ref norm) = &$self.normalizer {
|
|
46
46
|
let wrapper = norm.read().unwrap();
|
|
47
|
-
if let RbNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = *wrapper {
|
|
47
|
+
if let RbNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (*wrapper).clone() {
|
|
48
48
|
o.$name
|
|
49
49
|
} else {
|
|
50
50
|
unreachable!()
|
|
@@ -105,6 +105,14 @@ impl RbNormalizer {
|
|
|
105
105
|
setter!(self, BertNormalizer, lowercase, lowercase)
|
|
106
106
|
}
|
|
107
107
|
|
|
108
|
+
fn prepend_prepend(&self) -> String {
|
|
109
|
+
getter!(self, Prepend, prepend)
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
fn prepend_set_prepend(&self, prepend: String) {
|
|
113
|
+
setter!(self, Prepend, prepend, prepend)
|
|
114
|
+
}
|
|
115
|
+
|
|
108
116
|
fn strip_left(&self) -> bool {
|
|
109
117
|
getter!(self, StripNormalizer, strip_left)
|
|
110
118
|
}
|
|
@@ -186,6 +194,14 @@ impl RbReplace {
|
|
|
186
194
|
}
|
|
187
195
|
}
|
|
188
196
|
|
|
197
|
+
pub struct RbPrepend {}
|
|
198
|
+
|
|
199
|
+
impl RbPrepend {
|
|
200
|
+
pub fn new(prepend: String) -> RbNormalizer {
|
|
201
|
+
Prepend::new(prepend).into()
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
|
|
189
205
|
pub struct RbStrip {}
|
|
190
206
|
|
|
191
207
|
impl RbStrip {
|
|
@@ -372,6 +388,11 @@ unsafe impl TypedData for RbNormalizer {
|
|
|
372
388
|
class.undef_alloc_func();
|
|
373
389
|
class
|
|
374
390
|
}),
|
|
391
|
+
NormalizerWrapper::Prepend(_) => *memoize!(RClass: {
|
|
392
|
+
let class: RClass = crate::normalizers().const_get("Prepend").unwrap();
|
|
393
|
+
class.undef_alloc_func();
|
|
394
|
+
class
|
|
395
|
+
}),
|
|
375
396
|
NormalizerWrapper::StripNormalizer(_) => *memoize!(RClass: {
|
|
376
397
|
let class: RClass = crate::normalizers().const_get("Strip").unwrap();
|
|
377
398
|
class.undef_alloc_func();
|
|
@@ -428,6 +449,11 @@ pub fn normalizers(module: &RModule) -> RbResult<()> {
|
|
|
428
449
|
let class = module.define_class("Replace", normalizer)?;
|
|
429
450
|
class.define_singleton_method("new", function!(RbReplace::new, 2))?;
|
|
430
451
|
|
|
452
|
+
let class = module.define_class("Prepend", normalizer)?;
|
|
453
|
+
class.define_singleton_method("_new", function!(RbPrepend::new, 1))?;
|
|
454
|
+
class.define_method("prepend", method!(RbNormalizer::prepend_prepend, 0))?;
|
|
455
|
+
class.define_method("prepend=", method!(RbNormalizer::prepend_set_prepend, 1))?;
|
|
456
|
+
|
|
431
457
|
let class = module.define_class("Strip", normalizer)?;
|
|
432
458
|
class.define_singleton_method("_new", function!(RbStrip::new, 2))?;
|
|
433
459
|
class.define_method("left", method!(RbNormalizer::strip_left, 0))?;
|
data/lib/tokenizers/version.rb
CHANGED
data/lib/tokenizers.rb
CHANGED
|
@@ -9,6 +9,7 @@ end
|
|
|
9
9
|
require_relative "tokenizers/decoders/bpe_decoder"
|
|
10
10
|
require_relative "tokenizers/decoders/ctc"
|
|
11
11
|
require_relative "tokenizers/decoders/metaspace"
|
|
12
|
+
require_relative "tokenizers/decoders/strip"
|
|
12
13
|
require_relative "tokenizers/decoders/word_piece"
|
|
13
14
|
|
|
14
15
|
# models
|
|
@@ -19,6 +20,7 @@ require_relative "tokenizers/models/unigram"
|
|
|
19
20
|
|
|
20
21
|
# normalizers
|
|
21
22
|
require_relative "tokenizers/normalizers/bert_normalizer"
|
|
23
|
+
require_relative "tokenizers/normalizers/prepend"
|
|
22
24
|
require_relative "tokenizers/normalizers/strip"
|
|
23
25
|
|
|
24
26
|
# pre-tokenizers
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tokenizers
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.2
|
|
4
|
+
version: 0.3.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Kane
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2023-
|
|
11
|
+
date: 2023-04-09 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -56,6 +56,7 @@ files:
|
|
|
56
56
|
- lib/tokenizers/decoders/bpe_decoder.rb
|
|
57
57
|
- lib/tokenizers/decoders/ctc.rb
|
|
58
58
|
- lib/tokenizers/decoders/metaspace.rb
|
|
59
|
+
- lib/tokenizers/decoders/strip.rb
|
|
59
60
|
- lib/tokenizers/decoders/word_piece.rb
|
|
60
61
|
- lib/tokenizers/encoding.rb
|
|
61
62
|
- lib/tokenizers/from_pretrained.rb
|
|
@@ -64,6 +65,7 @@ files:
|
|
|
64
65
|
- lib/tokenizers/models/word_level.rb
|
|
65
66
|
- lib/tokenizers/models/word_piece.rb
|
|
66
67
|
- lib/tokenizers/normalizers/bert_normalizer.rb
|
|
68
|
+
- lib/tokenizers/normalizers/prepend.rb
|
|
67
69
|
- lib/tokenizers/normalizers/strip.rb
|
|
68
70
|
- lib/tokenizers/pre_tokenizers/byte_level.rb
|
|
69
71
|
- lib/tokenizers/pre_tokenizers/digits.rb
|
|
@@ -98,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
98
100
|
- !ruby/object:Gem::Version
|
|
99
101
|
version: '0'
|
|
100
102
|
requirements: []
|
|
101
|
-
rubygems_version: 3.4.
|
|
103
|
+
rubygems_version: 3.4.10
|
|
102
104
|
signing_key:
|
|
103
105
|
specification_version: 4
|
|
104
106
|
summary: Fast state-of-the-art tokenizers for Ruby
|