tokenizers 0.5.3 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Cargo.lock +154 -83
- data/ext/tokenizers/Cargo.toml +2 -2
- data/ext/tokenizers/src/decoders.rs +32 -14
- data/ext/tokenizers/src/error.rs +6 -1
- data/ext/tokenizers/src/lib.rs +37 -12
- data/ext/tokenizers/src/models.rs +75 -23
- data/ext/tokenizers/src/normalizers.rs +84 -24
- data/ext/tokenizers/src/pre_tokenizers.rs +121 -42
- data/ext/tokenizers/src/processors.rs +22 -10
- data/ext/tokenizers/src/tokenizer.rs +63 -34
- data/ext/tokenizers/src/trainers.rs +215 -56
- data/ext/tokenizers/src/utils/regex.rs +6 -4
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/version.rb +1 -1
- metadata +3 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8394d394a6ebaa502c53d08508586044c1f6e0ea8cd8c6629e6a7c2bed38518e
|
4
|
+
data.tar.gz: 1b54fa285fb6799c2cbc411c21c0f951db1fdacc3291b1da1971f420ada07820
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f75af568151b9aa3fb9b57c2020464e5d1270174a25adff113f9058b8a92c11288f4df22c1f382af47c9312c1cca83f90214b1875a37334e232bd35bbfd4785e
|
7
|
+
data.tar.gz: a5b9d665dd2f985f03ea1056887169318c679a4fd30bc768f00109bd2d225958233f57f254a7c1e505c0b3b3fdb94b8a2fcd4027c55405088c6c1e88e3fd24be
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
@@ -19,9 +19,9 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
|
19
19
|
|
20
20
|
[[package]]
|
21
21
|
name = "bindgen"
|
22
|
-
version = "0.69.
|
22
|
+
version = "0.69.5"
|
23
23
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
24
|
-
checksum = "
|
24
|
+
checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
|
25
25
|
dependencies = [
|
26
26
|
"bitflags 2.6.0",
|
27
27
|
"cexpr",
|
@@ -49,6 +49,12 @@ version = "2.6.0"
|
|
49
49
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
50
50
|
checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
|
51
51
|
|
52
|
+
[[package]]
|
53
|
+
name = "bumpalo"
|
54
|
+
version = "3.16.0"
|
55
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
56
|
+
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
|
57
|
+
|
52
58
|
[[package]]
|
53
59
|
name = "byteorder"
|
54
60
|
version = "1.5.0"
|
@@ -57,9 +63,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
|
|
57
63
|
|
58
64
|
[[package]]
|
59
65
|
name = "cc"
|
60
|
-
version = "1.
|
66
|
+
version = "1.2.6"
|
61
67
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
62
|
-
checksum = "
|
68
|
+
checksum = "8d6dbb628b8f8555f86d0323c2eb39e3ec81901f4b83e091db8a6a76d316a333"
|
63
69
|
dependencies = [
|
64
70
|
"shlex",
|
65
71
|
]
|
@@ -92,22 +98,22 @@ dependencies = [
|
|
92
98
|
|
93
99
|
[[package]]
|
94
100
|
name = "console"
|
95
|
-
version = "0.15.
|
101
|
+
version = "0.15.10"
|
96
102
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
97
|
-
checksum = "
|
103
|
+
checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b"
|
98
104
|
dependencies = [
|
99
105
|
"encode_unicode",
|
100
|
-
"lazy_static",
|
101
106
|
"libc",
|
107
|
+
"once_cell",
|
102
108
|
"unicode-width",
|
103
109
|
"windows-sys",
|
104
110
|
]
|
105
111
|
|
106
112
|
[[package]]
|
107
113
|
name = "crossbeam-deque"
|
108
|
-
version = "0.8.
|
114
|
+
version = "0.8.6"
|
109
115
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
110
|
-
checksum = "
|
116
|
+
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
|
111
117
|
dependencies = [
|
112
118
|
"crossbeam-epoch",
|
113
119
|
"crossbeam-utils",
|
@@ -124,9 +130,9 @@ dependencies = [
|
|
124
130
|
|
125
131
|
[[package]]
|
126
132
|
name = "crossbeam-utils"
|
127
|
-
version = "0.8.
|
133
|
+
version = "0.8.21"
|
128
134
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
129
|
-
checksum = "
|
135
|
+
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
130
136
|
|
131
137
|
[[package]]
|
132
138
|
name = "darling"
|
@@ -165,18 +171,18 @@ dependencies = [
|
|
165
171
|
|
166
172
|
[[package]]
|
167
173
|
name = "derive_builder"
|
168
|
-
version = "0.20.
|
174
|
+
version = "0.20.2"
|
169
175
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
170
|
-
checksum = "
|
176
|
+
checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947"
|
171
177
|
dependencies = [
|
172
178
|
"derive_builder_macro",
|
173
179
|
]
|
174
180
|
|
175
181
|
[[package]]
|
176
182
|
name = "derive_builder_core"
|
177
|
-
version = "0.20.
|
183
|
+
version = "0.20.2"
|
178
184
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
179
|
-
checksum = "
|
185
|
+
checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8"
|
180
186
|
dependencies = [
|
181
187
|
"darling",
|
182
188
|
"proc-macro2",
|
@@ -186,9 +192,9 @@ dependencies = [
|
|
186
192
|
|
187
193
|
[[package]]
|
188
194
|
name = "derive_builder_macro"
|
189
|
-
version = "0.20.
|
195
|
+
version = "0.20.2"
|
190
196
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
191
|
-
checksum = "
|
197
|
+
checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c"
|
192
198
|
dependencies = [
|
193
199
|
"derive_builder_core",
|
194
200
|
"syn",
|
@@ -202,9 +208,9 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
|
|
202
208
|
|
203
209
|
[[package]]
|
204
210
|
name = "encode_unicode"
|
205
|
-
version = "0.
|
211
|
+
version = "1.0.0"
|
206
212
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
207
|
-
checksum = "
|
213
|
+
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
|
208
214
|
|
209
215
|
[[package]]
|
210
216
|
name = "esaxx-rs"
|
@@ -234,9 +240,9 @@ dependencies = [
|
|
234
240
|
|
235
241
|
[[package]]
|
236
242
|
name = "glob"
|
237
|
-
version = "0.3.
|
243
|
+
version = "0.3.2"
|
238
244
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
239
|
-
checksum = "
|
245
|
+
checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2"
|
240
246
|
|
241
247
|
[[package]]
|
242
248
|
name = "ident_case"
|
@@ -246,24 +252,15 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
|
|
246
252
|
|
247
253
|
[[package]]
|
248
254
|
name = "indicatif"
|
249
|
-
version = "0.17.
|
255
|
+
version = "0.17.9"
|
250
256
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
251
|
-
checksum = "
|
257
|
+
checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281"
|
252
258
|
dependencies = [
|
253
259
|
"console",
|
254
|
-
"instant",
|
255
260
|
"number_prefix",
|
256
261
|
"portable-atomic",
|
257
262
|
"unicode-width",
|
258
|
-
|
259
|
-
|
260
|
-
[[package]]
|
261
|
-
name = "instant"
|
262
|
-
version = "0.1.13"
|
263
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
264
|
-
checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
|
265
|
-
dependencies = [
|
266
|
-
"cfg-if",
|
263
|
+
"web-time",
|
267
264
|
]
|
268
265
|
|
269
266
|
[[package]]
|
@@ -286,9 +283,19 @@ dependencies = [
|
|
286
283
|
|
287
284
|
[[package]]
|
288
285
|
name = "itoa"
|
289
|
-
version = "1.0.
|
286
|
+
version = "1.0.14"
|
290
287
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
291
|
-
checksum = "
|
288
|
+
checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
|
289
|
+
|
290
|
+
[[package]]
|
291
|
+
name = "js-sys"
|
292
|
+
version = "0.3.76"
|
293
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
294
|
+
checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7"
|
295
|
+
dependencies = [
|
296
|
+
"once_cell",
|
297
|
+
"wasm-bindgen",
|
298
|
+
]
|
292
299
|
|
293
300
|
[[package]]
|
294
301
|
name = "lazy_static"
|
@@ -304,15 +311,15 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
|
304
311
|
|
305
312
|
[[package]]
|
306
313
|
name = "libc"
|
307
|
-
version = "0.2.
|
314
|
+
version = "0.2.169"
|
308
315
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
309
|
-
checksum = "
|
316
|
+
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
|
310
317
|
|
311
318
|
[[package]]
|
312
319
|
name = "libloading"
|
313
|
-
version = "0.8.
|
320
|
+
version = "0.8.6"
|
314
321
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
315
|
-
checksum = "
|
322
|
+
checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34"
|
316
323
|
dependencies = [
|
317
324
|
"cfg-if",
|
318
325
|
"windows-targets",
|
@@ -414,9 +421,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
|
|
414
421
|
|
415
422
|
[[package]]
|
416
423
|
name = "once_cell"
|
417
|
-
version = "1.
|
424
|
+
version = "1.20.2"
|
418
425
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
419
|
-
checksum = "
|
426
|
+
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
|
420
427
|
|
421
428
|
[[package]]
|
422
429
|
name = "onig"
|
@@ -448,15 +455,15 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
|
|
448
455
|
|
449
456
|
[[package]]
|
450
457
|
name = "pkg-config"
|
451
|
-
version = "0.3.
|
458
|
+
version = "0.3.31"
|
452
459
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
453
|
-
checksum = "
|
460
|
+
checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
|
454
461
|
|
455
462
|
[[package]]
|
456
463
|
name = "portable-atomic"
|
457
|
-
version = "1.
|
464
|
+
version = "1.10.0"
|
458
465
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
459
|
-
checksum = "
|
466
|
+
checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
|
460
467
|
|
461
468
|
[[package]]
|
462
469
|
name = "ppv-lite86"
|
@@ -469,18 +476,18 @@ dependencies = [
|
|
469
476
|
|
470
477
|
[[package]]
|
471
478
|
name = "proc-macro2"
|
472
|
-
version = "1.0.
|
479
|
+
version = "1.0.92"
|
473
480
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
474
|
-
checksum = "
|
481
|
+
checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
|
475
482
|
dependencies = [
|
476
483
|
"unicode-ident",
|
477
484
|
]
|
478
485
|
|
479
486
|
[[package]]
|
480
487
|
name = "quote"
|
481
|
-
version = "1.0.
|
488
|
+
version = "1.0.38"
|
482
489
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
483
|
-
checksum = "
|
490
|
+
checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc"
|
484
491
|
dependencies = [
|
485
492
|
"proc-macro2",
|
486
493
|
]
|
@@ -548,18 +555,18 @@ dependencies = [
|
|
548
555
|
|
549
556
|
[[package]]
|
550
557
|
name = "rb-sys"
|
551
|
-
version = "0.9.
|
558
|
+
version = "0.9.105"
|
552
559
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
553
|
-
checksum = "
|
560
|
+
checksum = "4b3a1f3ce8e7c36d777d52fe7a99039fe4fea7c8ec355a4c4f3a17f92a14029f"
|
554
561
|
dependencies = [
|
555
562
|
"rb-sys-build",
|
556
563
|
]
|
557
564
|
|
558
565
|
[[package]]
|
559
566
|
name = "rb-sys-build"
|
560
|
-
version = "0.9.
|
567
|
+
version = "0.9.105"
|
561
568
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
562
|
-
checksum = "
|
569
|
+
checksum = "3e6b246c29c0809e1cbe60a1ba9e093da72a4676d02adc68469297d1e589bbf0"
|
563
570
|
dependencies = [
|
564
571
|
"bindgen",
|
565
572
|
"lazy_static",
|
@@ -578,9 +585,9 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
|
|
578
585
|
|
579
586
|
[[package]]
|
580
587
|
name = "regex"
|
581
|
-
version = "1.
|
588
|
+
version = "1.11.1"
|
582
589
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
583
|
-
checksum = "
|
590
|
+
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
|
584
591
|
dependencies = [
|
585
592
|
"aho-corasick",
|
586
593
|
"memchr",
|
@@ -590,9 +597,9 @@ dependencies = [
|
|
590
597
|
|
591
598
|
[[package]]
|
592
599
|
name = "regex-automata"
|
593
|
-
version = "0.4.
|
600
|
+
version = "0.4.9"
|
594
601
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
595
|
-
checksum = "
|
602
|
+
checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
|
596
603
|
dependencies = [
|
597
604
|
"aho-corasick",
|
598
605
|
"memchr",
|
@@ -601,9 +608,9 @@ dependencies = [
|
|
601
608
|
|
602
609
|
[[package]]
|
603
610
|
name = "regex-syntax"
|
604
|
-
version = "0.8.
|
611
|
+
version = "0.8.5"
|
605
612
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
606
|
-
checksum = "
|
613
|
+
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
|
607
614
|
|
608
615
|
[[package]]
|
609
616
|
name = "rustc-hash"
|
@@ -625,18 +632,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
|
|
625
632
|
|
626
633
|
[[package]]
|
627
634
|
name = "serde"
|
628
|
-
version = "1.0.
|
635
|
+
version = "1.0.217"
|
629
636
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
630
|
-
checksum = "
|
637
|
+
checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70"
|
631
638
|
dependencies = [
|
632
639
|
"serde_derive",
|
633
640
|
]
|
634
641
|
|
635
642
|
[[package]]
|
636
643
|
name = "serde_derive"
|
637
|
-
version = "1.0.
|
644
|
+
version = "1.0.217"
|
638
645
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
639
|
-
checksum = "
|
646
|
+
checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0"
|
640
647
|
dependencies = [
|
641
648
|
"proc-macro2",
|
642
649
|
"quote",
|
@@ -645,9 +652,9 @@ dependencies = [
|
|
645
652
|
|
646
653
|
[[package]]
|
647
654
|
name = "serde_json"
|
648
|
-
version = "1.0.
|
655
|
+
version = "1.0.134"
|
649
656
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
650
|
-
checksum = "
|
657
|
+
checksum = "d00f4175c42ee48b15416f6193a959ba3a0d67fc699a0db9ad12df9f83991c7d"
|
651
658
|
dependencies = [
|
652
659
|
"itoa",
|
653
660
|
"memchr",
|
@@ -693,9 +700,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
|
693
700
|
|
694
701
|
[[package]]
|
695
702
|
name = "syn"
|
696
|
-
version = "2.0.
|
703
|
+
version = "2.0.93"
|
697
704
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
698
|
-
checksum = "
|
705
|
+
checksum = "9c786062daee0d6db1132800e623df74274a0a87322d8e183338e01b3d98d058"
|
699
706
|
dependencies = [
|
700
707
|
"proc-macro2",
|
701
708
|
"quote",
|
@@ -704,18 +711,18 @@ dependencies = [
|
|
704
711
|
|
705
712
|
[[package]]
|
706
713
|
name = "thiserror"
|
707
|
-
version = "1.0.
|
714
|
+
version = "1.0.69"
|
708
715
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
709
|
-
checksum = "
|
716
|
+
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
|
710
717
|
dependencies = [
|
711
718
|
"thiserror-impl",
|
712
719
|
]
|
713
720
|
|
714
721
|
[[package]]
|
715
722
|
name = "thiserror-impl"
|
716
|
-
version = "1.0.
|
723
|
+
version = "1.0.69"
|
717
724
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
718
|
-
checksum = "
|
725
|
+
checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
|
719
726
|
dependencies = [
|
720
727
|
"proc-macro2",
|
721
728
|
"quote",
|
@@ -724,19 +731,19 @@ dependencies = [
|
|
724
731
|
|
725
732
|
[[package]]
|
726
733
|
name = "tokenizers"
|
727
|
-
version = "0.5.
|
734
|
+
version = "0.5.4"
|
728
735
|
dependencies = [
|
729
736
|
"magnus",
|
730
737
|
"onig",
|
731
738
|
"serde",
|
732
|
-
"tokenizers 0.
|
739
|
+
"tokenizers 0.21.0",
|
733
740
|
]
|
734
741
|
|
735
742
|
[[package]]
|
736
743
|
name = "tokenizers"
|
737
|
-
version = "0.
|
744
|
+
version = "0.21.0"
|
738
745
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
739
|
-
checksum = "
|
746
|
+
checksum = "9ecededfed68a69bc657e486510089e255e53c3d38cc7d4d59c8742668ca2cae"
|
740
747
|
dependencies = [
|
741
748
|
"aho-corasick",
|
742
749
|
"derive_builder",
|
@@ -766,9 +773,9 @@ dependencies = [
|
|
766
773
|
|
767
774
|
[[package]]
|
768
775
|
name = "unicode-ident"
|
769
|
-
version = "1.0.
|
776
|
+
version = "1.0.14"
|
770
777
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
771
|
-
checksum = "
|
778
|
+
checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
|
772
779
|
|
773
780
|
[[package]]
|
774
781
|
name = "unicode-normalization-alignments"
|
@@ -781,15 +788,15 @@ dependencies = [
|
|
781
788
|
|
782
789
|
[[package]]
|
783
790
|
name = "unicode-segmentation"
|
784
|
-
version = "1.
|
791
|
+
version = "1.12.0"
|
785
792
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
786
|
-
checksum = "
|
793
|
+
checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
|
787
794
|
|
788
795
|
[[package]]
|
789
796
|
name = "unicode-width"
|
790
|
-
version = "0.
|
797
|
+
version = "0.2.0"
|
791
798
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
792
|
-
checksum = "
|
799
|
+
checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
|
793
800
|
|
794
801
|
[[package]]
|
795
802
|
name = "unicode_categories"
|
@@ -803,11 +810,75 @@ version = "0.11.0+wasi-snapshot-preview1"
|
|
803
810
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
804
811
|
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
805
812
|
|
813
|
+
[[package]]
|
814
|
+
name = "wasm-bindgen"
|
815
|
+
version = "0.2.99"
|
816
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
817
|
+
checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396"
|
818
|
+
dependencies = [
|
819
|
+
"cfg-if",
|
820
|
+
"once_cell",
|
821
|
+
"wasm-bindgen-macro",
|
822
|
+
]
|
823
|
+
|
824
|
+
[[package]]
|
825
|
+
name = "wasm-bindgen-backend"
|
826
|
+
version = "0.2.99"
|
827
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
828
|
+
checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79"
|
829
|
+
dependencies = [
|
830
|
+
"bumpalo",
|
831
|
+
"log",
|
832
|
+
"proc-macro2",
|
833
|
+
"quote",
|
834
|
+
"syn",
|
835
|
+
"wasm-bindgen-shared",
|
836
|
+
]
|
837
|
+
|
838
|
+
[[package]]
|
839
|
+
name = "wasm-bindgen-macro"
|
840
|
+
version = "0.2.99"
|
841
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
842
|
+
checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe"
|
843
|
+
dependencies = [
|
844
|
+
"quote",
|
845
|
+
"wasm-bindgen-macro-support",
|
846
|
+
]
|
847
|
+
|
848
|
+
[[package]]
|
849
|
+
name = "wasm-bindgen-macro-support"
|
850
|
+
version = "0.2.99"
|
851
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
852
|
+
checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2"
|
853
|
+
dependencies = [
|
854
|
+
"proc-macro2",
|
855
|
+
"quote",
|
856
|
+
"syn",
|
857
|
+
"wasm-bindgen-backend",
|
858
|
+
"wasm-bindgen-shared",
|
859
|
+
]
|
860
|
+
|
861
|
+
[[package]]
|
862
|
+
name = "wasm-bindgen-shared"
|
863
|
+
version = "0.2.99"
|
864
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
865
|
+
checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6"
|
866
|
+
|
867
|
+
[[package]]
|
868
|
+
name = "web-time"
|
869
|
+
version = "1.1.0"
|
870
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
871
|
+
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
|
872
|
+
dependencies = [
|
873
|
+
"js-sys",
|
874
|
+
"wasm-bindgen",
|
875
|
+
]
|
876
|
+
|
806
877
|
[[package]]
|
807
878
|
name = "windows-sys"
|
808
|
-
version = "0.
|
879
|
+
version = "0.59.0"
|
809
880
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
810
|
-
checksum = "
|
881
|
+
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
|
811
882
|
dependencies = [
|
812
883
|
"windows-targets",
|
813
884
|
]
|
data/ext/tokenizers/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "tokenizers"
|
3
|
-
version = "0.5.
|
3
|
+
version = "0.5.4"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -16,6 +16,6 @@ onig = { version = "6", default-features = false }
|
|
16
16
|
serde = { version = "1", features = ["rc", "derive"] }
|
17
17
|
|
18
18
|
[dependencies.tokenizers]
|
19
|
-
version = "=0.
|
19
|
+
version = "=0.21.0" # also update in from_pretrained.rb
|
20
20
|
default-features = false
|
21
21
|
features = ["progressbar", "onig", "esaxx_fast"]
|
@@ -3,8 +3,8 @@ use std::sync::{Arc, RwLock};
|
|
3
3
|
use crate::pre_tokenizers::from_string;
|
4
4
|
use magnus::value::Lazy;
|
5
5
|
use magnus::{
|
6
|
-
data_type_builder, function, method, Class, DataType, DataTypeFunctions, Module, Object,
|
7
|
-
Ruby, TypedData,
|
6
|
+
data_type_builder, function, method, Class, DataType, DataTypeFunctions, Module, Object,
|
7
|
+
RClass, RModule, Ruby, TypedData,
|
8
8
|
};
|
9
9
|
use serde::{Deserialize, Serialize};
|
10
10
|
use tk::decoders::bpe::BPEDecoder;
|
@@ -16,11 +16,11 @@ use tk::decoders::metaspace::{Metaspace, PrependScheme};
|
|
16
16
|
use tk::decoders::strip::Strip;
|
17
17
|
use tk::decoders::wordpiece::WordPiece;
|
18
18
|
use tk::decoders::DecoderWrapper;
|
19
|
-
use tk::Decoder;
|
20
19
|
use tk::normalizers::replace::Replace;
|
20
|
+
use tk::Decoder;
|
21
21
|
|
22
22
|
use super::utils::*;
|
23
|
-
use super::{
|
23
|
+
use super::{RbError, RbResult, DECODERS};
|
24
24
|
|
25
25
|
#[derive(DataTypeFunctions, Clone, Deserialize, Serialize)]
|
26
26
|
pub struct RbDecoder {
|
@@ -106,7 +106,7 @@ impl RbDecoder {
|
|
106
106
|
}
|
107
107
|
|
108
108
|
fn strip_set_content(&self, content: char) {
|
109
|
-
setter!(self, Strip, content, content)
|
109
|
+
setter!(self, Strip, content, content);
|
110
110
|
}
|
111
111
|
|
112
112
|
fn strip_start(&self) -> usize {
|
@@ -114,7 +114,7 @@ impl RbDecoder {
|
|
114
114
|
}
|
115
115
|
|
116
116
|
fn strip_set_start(&self, start: usize) {
|
117
|
-
setter!(self, Strip, start, start)
|
117
|
+
setter!(self, Strip, start, start);
|
118
118
|
}
|
119
119
|
|
120
120
|
fn strip_stop(&self) -> usize {
|
@@ -122,7 +122,7 @@ impl RbDecoder {
|
|
122
122
|
}
|
123
123
|
|
124
124
|
fn strip_set_stop(&self, stop: usize) {
|
125
|
-
setter!(self, Strip, stop, stop)
|
125
|
+
setter!(self, Strip, stop, stop);
|
126
126
|
}
|
127
127
|
|
128
128
|
pub fn metaspace_replacement(&self) -> char {
|
@@ -228,7 +228,9 @@ pub struct RbReplaceDecoder {}
|
|
228
228
|
|
229
229
|
impl RbReplaceDecoder {
|
230
230
|
pub fn new(pattern: RbPattern, content: String) -> RbResult<RbDecoder> {
|
231
|
-
Replace::new(pattern, content)
|
231
|
+
Replace::new(pattern, content)
|
232
|
+
.map(|v| v.into())
|
233
|
+
.map_err(RbError::from)
|
232
234
|
}
|
233
235
|
}
|
234
236
|
|
@@ -295,7 +297,8 @@ unsafe impl TypedData for RbDecoder {
|
|
295
297
|
}
|
296
298
|
|
297
299
|
fn data_type() -> &'static DataType {
|
298
|
-
static DATA_TYPE: DataType =
|
300
|
+
static DATA_TYPE: DataType =
|
301
|
+
data_type_builder!(RbDecoder, "Tokenizers::Decoders::Decoder").build();
|
299
302
|
&DATA_TYPE
|
300
303
|
}
|
301
304
|
|
@@ -383,18 +386,33 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
383
386
|
class.define_method("cleanup=", method!(RbDecoder::ctc_set_cleanup, 1))?;
|
384
387
|
class.define_method("pad_token", method!(RbDecoder::ctc_pad_token, 0))?;
|
385
388
|
class.define_method("pad_token=", method!(RbDecoder::ctc_set_pad_token, 1))?;
|
386
|
-
class.define_method(
|
387
|
-
|
389
|
+
class.define_method(
|
390
|
+
"word_delimiter_token",
|
391
|
+
method!(RbDecoder::ctc_word_delimiter_token, 0),
|
392
|
+
)?;
|
393
|
+
class.define_method(
|
394
|
+
"word_delimiter_token=",
|
395
|
+
method!(RbDecoder::ctc_set_word_delimiter_token, 1),
|
396
|
+
)?;
|
388
397
|
|
389
398
|
let class = module.define_class("Fuse", decoder)?;
|
390
399
|
class.define_singleton_method("new", function!(RbFuse::new, 0))?;
|
391
400
|
|
392
401
|
let class = module.define_class("Metaspace", decoder)?;
|
393
402
|
class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 3))?;
|
394
|
-
class.define_method(
|
395
|
-
|
403
|
+
class.define_method(
|
404
|
+
"prepend_scheme",
|
405
|
+
method!(RbDecoder::metaspace_prepend_scheme, 0),
|
406
|
+
)?;
|
407
|
+
class.define_method(
|
408
|
+
"prepend_scheme=",
|
409
|
+
method!(RbDecoder::metaspace_set_prepend_scheme, 1),
|
410
|
+
)?;
|
396
411
|
class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
|
397
|
-
class.define_method(
|
412
|
+
class.define_method(
|
413
|
+
"replacement=",
|
414
|
+
method!(RbDecoder::metaspace_set_replacement, 1),
|
415
|
+
)?;
|
398
416
|
class.define_method("split", method!(RbDecoder::metaspace_split, 0))?;
|
399
417
|
class.define_method("split=", method!(RbDecoder::metaspace_set_split, 1))?;
|
400
418
|
|
data/ext/tokenizers/src/error.rs
CHANGED
@@ -9,9 +9,14 @@ impl RbError {
|
|
9
9
|
pub fn from(e: Box<dyn std::error::Error + Send + Sync>) -> Error {
|
10
10
|
Error::new(error(), e.to_string())
|
11
11
|
}
|
12
|
+
|
13
|
+
pub fn new_err(s: String) -> Error {
|
14
|
+
Error::new(error(), s)
|
15
|
+
}
|
12
16
|
}
|
13
17
|
|
14
|
-
static ERROR: Lazy<ExceptionClass> =
|
18
|
+
static ERROR: Lazy<ExceptionClass> =
|
19
|
+
Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Error").unwrap());
|
15
20
|
|
16
21
|
fn error() -> ExceptionClass {
|
17
22
|
Ruby::get().unwrap().get_inner(&ERROR)
|