tokenizers 0.5.3 → 0.5.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Cargo.lock +154 -83
- data/ext/tokenizers/Cargo.toml +2 -2
- data/ext/tokenizers/src/decoders.rs +32 -14
- data/ext/tokenizers/src/error.rs +6 -1
- data/ext/tokenizers/src/lib.rs +37 -12
- data/ext/tokenizers/src/models.rs +75 -23
- data/ext/tokenizers/src/normalizers.rs +84 -24
- data/ext/tokenizers/src/pre_tokenizers.rs +121 -42
- data/ext/tokenizers/src/processors.rs +22 -10
- data/ext/tokenizers/src/tokenizer.rs +63 -34
- data/ext/tokenizers/src/trainers.rs +215 -56
- data/ext/tokenizers/src/utils/regex.rs +6 -4
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/version.rb +1 -1
- metadata +3 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8394d394a6ebaa502c53d08508586044c1f6e0ea8cd8c6629e6a7c2bed38518e
|
4
|
+
data.tar.gz: 1b54fa285fb6799c2cbc411c21c0f951db1fdacc3291b1da1971f420ada07820
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f75af568151b9aa3fb9b57c2020464e5d1270174a25adff113f9058b8a92c11288f4df22c1f382af47c9312c1cca83f90214b1875a37334e232bd35bbfd4785e
|
7
|
+
data.tar.gz: a5b9d665dd2f985f03ea1056887169318c679a4fd30bc768f00109bd2d225958233f57f254a7c1e505c0b3b3fdb94b8a2fcd4027c55405088c6c1e88e3fd24be
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
@@ -19,9 +19,9 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
|
19
19
|
|
20
20
|
[[package]]
|
21
21
|
name = "bindgen"
|
22
|
-
version = "0.69.
|
22
|
+
version = "0.69.5"
|
23
23
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
24
|
-
checksum = "
|
24
|
+
checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
|
25
25
|
dependencies = [
|
26
26
|
"bitflags 2.6.0",
|
27
27
|
"cexpr",
|
@@ -49,6 +49,12 @@ version = "2.6.0"
|
|
49
49
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
50
50
|
checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
|
51
51
|
|
52
|
+
[[package]]
|
53
|
+
name = "bumpalo"
|
54
|
+
version = "3.16.0"
|
55
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
56
|
+
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
|
57
|
+
|
52
58
|
[[package]]
|
53
59
|
name = "byteorder"
|
54
60
|
version = "1.5.0"
|
@@ -57,9 +63,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
|
|
57
63
|
|
58
64
|
[[package]]
|
59
65
|
name = "cc"
|
60
|
-
version = "1.
|
66
|
+
version = "1.2.6"
|
61
67
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
62
|
-
checksum = "
|
68
|
+
checksum = "8d6dbb628b8f8555f86d0323c2eb39e3ec81901f4b83e091db8a6a76d316a333"
|
63
69
|
dependencies = [
|
64
70
|
"shlex",
|
65
71
|
]
|
@@ -92,22 +98,22 @@ dependencies = [
|
|
92
98
|
|
93
99
|
[[package]]
|
94
100
|
name = "console"
|
95
|
-
version = "0.15.
|
101
|
+
version = "0.15.10"
|
96
102
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
97
|
-
checksum = "
|
103
|
+
checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b"
|
98
104
|
dependencies = [
|
99
105
|
"encode_unicode",
|
100
|
-
"lazy_static",
|
101
106
|
"libc",
|
107
|
+
"once_cell",
|
102
108
|
"unicode-width",
|
103
109
|
"windows-sys",
|
104
110
|
]
|
105
111
|
|
106
112
|
[[package]]
|
107
113
|
name = "crossbeam-deque"
|
108
|
-
version = "0.8.
|
114
|
+
version = "0.8.6"
|
109
115
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
110
|
-
checksum = "
|
116
|
+
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
|
111
117
|
dependencies = [
|
112
118
|
"crossbeam-epoch",
|
113
119
|
"crossbeam-utils",
|
@@ -124,9 +130,9 @@ dependencies = [
|
|
124
130
|
|
125
131
|
[[package]]
|
126
132
|
name = "crossbeam-utils"
|
127
|
-
version = "0.8.
|
133
|
+
version = "0.8.21"
|
128
134
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
129
|
-
checksum = "
|
135
|
+
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
130
136
|
|
131
137
|
[[package]]
|
132
138
|
name = "darling"
|
@@ -165,18 +171,18 @@ dependencies = [
|
|
165
171
|
|
166
172
|
[[package]]
|
167
173
|
name = "derive_builder"
|
168
|
-
version = "0.20.
|
174
|
+
version = "0.20.2"
|
169
175
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
170
|
-
checksum = "
|
176
|
+
checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947"
|
171
177
|
dependencies = [
|
172
178
|
"derive_builder_macro",
|
173
179
|
]
|
174
180
|
|
175
181
|
[[package]]
|
176
182
|
name = "derive_builder_core"
|
177
|
-
version = "0.20.
|
183
|
+
version = "0.20.2"
|
178
184
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
179
|
-
checksum = "
|
185
|
+
checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8"
|
180
186
|
dependencies = [
|
181
187
|
"darling",
|
182
188
|
"proc-macro2",
|
@@ -186,9 +192,9 @@ dependencies = [
|
|
186
192
|
|
187
193
|
[[package]]
|
188
194
|
name = "derive_builder_macro"
|
189
|
-
version = "0.20.
|
195
|
+
version = "0.20.2"
|
190
196
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
191
|
-
checksum = "
|
197
|
+
checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c"
|
192
198
|
dependencies = [
|
193
199
|
"derive_builder_core",
|
194
200
|
"syn",
|
@@ -202,9 +208,9 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
|
|
202
208
|
|
203
209
|
[[package]]
|
204
210
|
name = "encode_unicode"
|
205
|
-
version = "0.
|
211
|
+
version = "1.0.0"
|
206
212
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
207
|
-
checksum = "
|
213
|
+
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
|
208
214
|
|
209
215
|
[[package]]
|
210
216
|
name = "esaxx-rs"
|
@@ -234,9 +240,9 @@ dependencies = [
|
|
234
240
|
|
235
241
|
[[package]]
|
236
242
|
name = "glob"
|
237
|
-
version = "0.3.
|
243
|
+
version = "0.3.2"
|
238
244
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
239
|
-
checksum = "
|
245
|
+
checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2"
|
240
246
|
|
241
247
|
[[package]]
|
242
248
|
name = "ident_case"
|
@@ -246,24 +252,15 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
|
|
246
252
|
|
247
253
|
[[package]]
|
248
254
|
name = "indicatif"
|
249
|
-
version = "0.17.
|
255
|
+
version = "0.17.9"
|
250
256
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
251
|
-
checksum = "
|
257
|
+
checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281"
|
252
258
|
dependencies = [
|
253
259
|
"console",
|
254
|
-
"instant",
|
255
260
|
"number_prefix",
|
256
261
|
"portable-atomic",
|
257
262
|
"unicode-width",
|
258
|
-
|
259
|
-
|
260
|
-
[[package]]
|
261
|
-
name = "instant"
|
262
|
-
version = "0.1.13"
|
263
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
264
|
-
checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
|
265
|
-
dependencies = [
|
266
|
-
"cfg-if",
|
263
|
+
"web-time",
|
267
264
|
]
|
268
265
|
|
269
266
|
[[package]]
|
@@ -286,9 +283,19 @@ dependencies = [
|
|
286
283
|
|
287
284
|
[[package]]
|
288
285
|
name = "itoa"
|
289
|
-
version = "1.0.
|
286
|
+
version = "1.0.14"
|
290
287
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
291
|
-
checksum = "
|
288
|
+
checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
|
289
|
+
|
290
|
+
[[package]]
|
291
|
+
name = "js-sys"
|
292
|
+
version = "0.3.76"
|
293
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
294
|
+
checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7"
|
295
|
+
dependencies = [
|
296
|
+
"once_cell",
|
297
|
+
"wasm-bindgen",
|
298
|
+
]
|
292
299
|
|
293
300
|
[[package]]
|
294
301
|
name = "lazy_static"
|
@@ -304,15 +311,15 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
|
304
311
|
|
305
312
|
[[package]]
|
306
313
|
name = "libc"
|
307
|
-
version = "0.2.
|
314
|
+
version = "0.2.169"
|
308
315
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
309
|
-
checksum = "
|
316
|
+
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
|
310
317
|
|
311
318
|
[[package]]
|
312
319
|
name = "libloading"
|
313
|
-
version = "0.8.
|
320
|
+
version = "0.8.6"
|
314
321
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
315
|
-
checksum = "
|
322
|
+
checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34"
|
316
323
|
dependencies = [
|
317
324
|
"cfg-if",
|
318
325
|
"windows-targets",
|
@@ -414,9 +421,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
|
|
414
421
|
|
415
422
|
[[package]]
|
416
423
|
name = "once_cell"
|
417
|
-
version = "1.
|
424
|
+
version = "1.20.2"
|
418
425
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
419
|
-
checksum = "
|
426
|
+
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
|
420
427
|
|
421
428
|
[[package]]
|
422
429
|
name = "onig"
|
@@ -448,15 +455,15 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
|
|
448
455
|
|
449
456
|
[[package]]
|
450
457
|
name = "pkg-config"
|
451
|
-
version = "0.3.
|
458
|
+
version = "0.3.31"
|
452
459
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
453
|
-
checksum = "
|
460
|
+
checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
|
454
461
|
|
455
462
|
[[package]]
|
456
463
|
name = "portable-atomic"
|
457
|
-
version = "1.
|
464
|
+
version = "1.10.0"
|
458
465
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
459
|
-
checksum = "
|
466
|
+
checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
|
460
467
|
|
461
468
|
[[package]]
|
462
469
|
name = "ppv-lite86"
|
@@ -469,18 +476,18 @@ dependencies = [
|
|
469
476
|
|
470
477
|
[[package]]
|
471
478
|
name = "proc-macro2"
|
472
|
-
version = "1.0.
|
479
|
+
version = "1.0.92"
|
473
480
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
474
|
-
checksum = "
|
481
|
+
checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
|
475
482
|
dependencies = [
|
476
483
|
"unicode-ident",
|
477
484
|
]
|
478
485
|
|
479
486
|
[[package]]
|
480
487
|
name = "quote"
|
481
|
-
version = "1.0.
|
488
|
+
version = "1.0.38"
|
482
489
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
483
|
-
checksum = "
|
490
|
+
checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc"
|
484
491
|
dependencies = [
|
485
492
|
"proc-macro2",
|
486
493
|
]
|
@@ -548,18 +555,18 @@ dependencies = [
|
|
548
555
|
|
549
556
|
[[package]]
|
550
557
|
name = "rb-sys"
|
551
|
-
version = "0.9.
|
558
|
+
version = "0.9.105"
|
552
559
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
553
|
-
checksum = "
|
560
|
+
checksum = "4b3a1f3ce8e7c36d777d52fe7a99039fe4fea7c8ec355a4c4f3a17f92a14029f"
|
554
561
|
dependencies = [
|
555
562
|
"rb-sys-build",
|
556
563
|
]
|
557
564
|
|
558
565
|
[[package]]
|
559
566
|
name = "rb-sys-build"
|
560
|
-
version = "0.9.
|
567
|
+
version = "0.9.105"
|
561
568
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
562
|
-
checksum = "
|
569
|
+
checksum = "3e6b246c29c0809e1cbe60a1ba9e093da72a4676d02adc68469297d1e589bbf0"
|
563
570
|
dependencies = [
|
564
571
|
"bindgen",
|
565
572
|
"lazy_static",
|
@@ -578,9 +585,9 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
|
|
578
585
|
|
579
586
|
[[package]]
|
580
587
|
name = "regex"
|
581
|
-
version = "1.
|
588
|
+
version = "1.11.1"
|
582
589
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
583
|
-
checksum = "
|
590
|
+
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
|
584
591
|
dependencies = [
|
585
592
|
"aho-corasick",
|
586
593
|
"memchr",
|
@@ -590,9 +597,9 @@ dependencies = [
|
|
590
597
|
|
591
598
|
[[package]]
|
592
599
|
name = "regex-automata"
|
593
|
-
version = "0.4.
|
600
|
+
version = "0.4.9"
|
594
601
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
595
|
-
checksum = "
|
602
|
+
checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
|
596
603
|
dependencies = [
|
597
604
|
"aho-corasick",
|
598
605
|
"memchr",
|
@@ -601,9 +608,9 @@ dependencies = [
|
|
601
608
|
|
602
609
|
[[package]]
|
603
610
|
name = "regex-syntax"
|
604
|
-
version = "0.8.
|
611
|
+
version = "0.8.5"
|
605
612
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
606
|
-
checksum = "
|
613
|
+
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
|
607
614
|
|
608
615
|
[[package]]
|
609
616
|
name = "rustc-hash"
|
@@ -625,18 +632,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
|
|
625
632
|
|
626
633
|
[[package]]
|
627
634
|
name = "serde"
|
628
|
-
version = "1.0.
|
635
|
+
version = "1.0.217"
|
629
636
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
630
|
-
checksum = "
|
637
|
+
checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70"
|
631
638
|
dependencies = [
|
632
639
|
"serde_derive",
|
633
640
|
]
|
634
641
|
|
635
642
|
[[package]]
|
636
643
|
name = "serde_derive"
|
637
|
-
version = "1.0.
|
644
|
+
version = "1.0.217"
|
638
645
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
639
|
-
checksum = "
|
646
|
+
checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0"
|
640
647
|
dependencies = [
|
641
648
|
"proc-macro2",
|
642
649
|
"quote",
|
@@ -645,9 +652,9 @@ dependencies = [
|
|
645
652
|
|
646
653
|
[[package]]
|
647
654
|
name = "serde_json"
|
648
|
-
version = "1.0.
|
655
|
+
version = "1.0.134"
|
649
656
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
650
|
-
checksum = "
|
657
|
+
checksum = "d00f4175c42ee48b15416f6193a959ba3a0d67fc699a0db9ad12df9f83991c7d"
|
651
658
|
dependencies = [
|
652
659
|
"itoa",
|
653
660
|
"memchr",
|
@@ -693,9 +700,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
|
693
700
|
|
694
701
|
[[package]]
|
695
702
|
name = "syn"
|
696
|
-
version = "2.0.
|
703
|
+
version = "2.0.93"
|
697
704
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
698
|
-
checksum = "
|
705
|
+
checksum = "9c786062daee0d6db1132800e623df74274a0a87322d8e183338e01b3d98d058"
|
699
706
|
dependencies = [
|
700
707
|
"proc-macro2",
|
701
708
|
"quote",
|
@@ -704,18 +711,18 @@ dependencies = [
|
|
704
711
|
|
705
712
|
[[package]]
|
706
713
|
name = "thiserror"
|
707
|
-
version = "1.0.
|
714
|
+
version = "1.0.69"
|
708
715
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
709
|
-
checksum = "
|
716
|
+
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
|
710
717
|
dependencies = [
|
711
718
|
"thiserror-impl",
|
712
719
|
]
|
713
720
|
|
714
721
|
[[package]]
|
715
722
|
name = "thiserror-impl"
|
716
|
-
version = "1.0.
|
723
|
+
version = "1.0.69"
|
717
724
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
718
|
-
checksum = "
|
725
|
+
checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
|
719
726
|
dependencies = [
|
720
727
|
"proc-macro2",
|
721
728
|
"quote",
|
@@ -724,19 +731,19 @@ dependencies = [
|
|
724
731
|
|
725
732
|
[[package]]
|
726
733
|
name = "tokenizers"
|
727
|
-
version = "0.5.
|
734
|
+
version = "0.5.4"
|
728
735
|
dependencies = [
|
729
736
|
"magnus",
|
730
737
|
"onig",
|
731
738
|
"serde",
|
732
|
-
"tokenizers 0.
|
739
|
+
"tokenizers 0.21.0",
|
733
740
|
]
|
734
741
|
|
735
742
|
[[package]]
|
736
743
|
name = "tokenizers"
|
737
|
-
version = "0.
|
744
|
+
version = "0.21.0"
|
738
745
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
739
|
-
checksum = "
|
746
|
+
checksum = "9ecededfed68a69bc657e486510089e255e53c3d38cc7d4d59c8742668ca2cae"
|
740
747
|
dependencies = [
|
741
748
|
"aho-corasick",
|
742
749
|
"derive_builder",
|
@@ -766,9 +773,9 @@ dependencies = [
|
|
766
773
|
|
767
774
|
[[package]]
|
768
775
|
name = "unicode-ident"
|
769
|
-
version = "1.0.
|
776
|
+
version = "1.0.14"
|
770
777
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
771
|
-
checksum = "
|
778
|
+
checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
|
772
779
|
|
773
780
|
[[package]]
|
774
781
|
name = "unicode-normalization-alignments"
|
@@ -781,15 +788,15 @@ dependencies = [
|
|
781
788
|
|
782
789
|
[[package]]
|
783
790
|
name = "unicode-segmentation"
|
784
|
-
version = "1.
|
791
|
+
version = "1.12.0"
|
785
792
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
786
|
-
checksum = "
|
793
|
+
checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
|
787
794
|
|
788
795
|
[[package]]
|
789
796
|
name = "unicode-width"
|
790
|
-
version = "0.
|
797
|
+
version = "0.2.0"
|
791
798
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
792
|
-
checksum = "
|
799
|
+
checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
|
793
800
|
|
794
801
|
[[package]]
|
795
802
|
name = "unicode_categories"
|
@@ -803,11 +810,75 @@ version = "0.11.0+wasi-snapshot-preview1"
|
|
803
810
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
804
811
|
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
805
812
|
|
813
|
+
[[package]]
|
814
|
+
name = "wasm-bindgen"
|
815
|
+
version = "0.2.99"
|
816
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
817
|
+
checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396"
|
818
|
+
dependencies = [
|
819
|
+
"cfg-if",
|
820
|
+
"once_cell",
|
821
|
+
"wasm-bindgen-macro",
|
822
|
+
]
|
823
|
+
|
824
|
+
[[package]]
|
825
|
+
name = "wasm-bindgen-backend"
|
826
|
+
version = "0.2.99"
|
827
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
828
|
+
checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79"
|
829
|
+
dependencies = [
|
830
|
+
"bumpalo",
|
831
|
+
"log",
|
832
|
+
"proc-macro2",
|
833
|
+
"quote",
|
834
|
+
"syn",
|
835
|
+
"wasm-bindgen-shared",
|
836
|
+
]
|
837
|
+
|
838
|
+
[[package]]
|
839
|
+
name = "wasm-bindgen-macro"
|
840
|
+
version = "0.2.99"
|
841
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
842
|
+
checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe"
|
843
|
+
dependencies = [
|
844
|
+
"quote",
|
845
|
+
"wasm-bindgen-macro-support",
|
846
|
+
]
|
847
|
+
|
848
|
+
[[package]]
|
849
|
+
name = "wasm-bindgen-macro-support"
|
850
|
+
version = "0.2.99"
|
851
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
852
|
+
checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2"
|
853
|
+
dependencies = [
|
854
|
+
"proc-macro2",
|
855
|
+
"quote",
|
856
|
+
"syn",
|
857
|
+
"wasm-bindgen-backend",
|
858
|
+
"wasm-bindgen-shared",
|
859
|
+
]
|
860
|
+
|
861
|
+
[[package]]
|
862
|
+
name = "wasm-bindgen-shared"
|
863
|
+
version = "0.2.99"
|
864
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
865
|
+
checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6"
|
866
|
+
|
867
|
+
[[package]]
|
868
|
+
name = "web-time"
|
869
|
+
version = "1.1.0"
|
870
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
871
|
+
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
|
872
|
+
dependencies = [
|
873
|
+
"js-sys",
|
874
|
+
"wasm-bindgen",
|
875
|
+
]
|
876
|
+
|
806
877
|
[[package]]
|
807
878
|
name = "windows-sys"
|
808
|
-
version = "0.
|
879
|
+
version = "0.59.0"
|
809
880
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
810
|
-
checksum = "
|
881
|
+
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
|
811
882
|
dependencies = [
|
812
883
|
"windows-targets",
|
813
884
|
]
|
data/ext/tokenizers/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "tokenizers"
|
3
|
-
version = "0.5.
|
3
|
+
version = "0.5.4"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -16,6 +16,6 @@ onig = { version = "6", default-features = false }
|
|
16
16
|
serde = { version = "1", features = ["rc", "derive"] }
|
17
17
|
|
18
18
|
[dependencies.tokenizers]
|
19
|
-
version = "=0.
|
19
|
+
version = "=0.21.0" # also update in from_pretrained.rb
|
20
20
|
default-features = false
|
21
21
|
features = ["progressbar", "onig", "esaxx_fast"]
|
@@ -3,8 +3,8 @@ use std::sync::{Arc, RwLock};
|
|
3
3
|
use crate::pre_tokenizers::from_string;
|
4
4
|
use magnus::value::Lazy;
|
5
5
|
use magnus::{
|
6
|
-
data_type_builder, function, method, Class, DataType, DataTypeFunctions, Module, Object,
|
7
|
-
Ruby, TypedData,
|
6
|
+
data_type_builder, function, method, Class, DataType, DataTypeFunctions, Module, Object,
|
7
|
+
RClass, RModule, Ruby, TypedData,
|
8
8
|
};
|
9
9
|
use serde::{Deserialize, Serialize};
|
10
10
|
use tk::decoders::bpe::BPEDecoder;
|
@@ -16,11 +16,11 @@ use tk::decoders::metaspace::{Metaspace, PrependScheme};
|
|
16
16
|
use tk::decoders::strip::Strip;
|
17
17
|
use tk::decoders::wordpiece::WordPiece;
|
18
18
|
use tk::decoders::DecoderWrapper;
|
19
|
-
use tk::Decoder;
|
20
19
|
use tk::normalizers::replace::Replace;
|
20
|
+
use tk::Decoder;
|
21
21
|
|
22
22
|
use super::utils::*;
|
23
|
-
use super::{
|
23
|
+
use super::{RbError, RbResult, DECODERS};
|
24
24
|
|
25
25
|
#[derive(DataTypeFunctions, Clone, Deserialize, Serialize)]
|
26
26
|
pub struct RbDecoder {
|
@@ -106,7 +106,7 @@ impl RbDecoder {
|
|
106
106
|
}
|
107
107
|
|
108
108
|
fn strip_set_content(&self, content: char) {
|
109
|
-
setter!(self, Strip, content, content)
|
109
|
+
setter!(self, Strip, content, content);
|
110
110
|
}
|
111
111
|
|
112
112
|
fn strip_start(&self) -> usize {
|
@@ -114,7 +114,7 @@ impl RbDecoder {
|
|
114
114
|
}
|
115
115
|
|
116
116
|
fn strip_set_start(&self, start: usize) {
|
117
|
-
setter!(self, Strip, start, start)
|
117
|
+
setter!(self, Strip, start, start);
|
118
118
|
}
|
119
119
|
|
120
120
|
fn strip_stop(&self) -> usize {
|
@@ -122,7 +122,7 @@ impl RbDecoder {
|
|
122
122
|
}
|
123
123
|
|
124
124
|
fn strip_set_stop(&self, stop: usize) {
|
125
|
-
setter!(self, Strip, stop, stop)
|
125
|
+
setter!(self, Strip, stop, stop);
|
126
126
|
}
|
127
127
|
|
128
128
|
pub fn metaspace_replacement(&self) -> char {
|
@@ -228,7 +228,9 @@ pub struct RbReplaceDecoder {}
|
|
228
228
|
|
229
229
|
impl RbReplaceDecoder {
|
230
230
|
pub fn new(pattern: RbPattern, content: String) -> RbResult<RbDecoder> {
|
231
|
-
Replace::new(pattern, content)
|
231
|
+
Replace::new(pattern, content)
|
232
|
+
.map(|v| v.into())
|
233
|
+
.map_err(RbError::from)
|
232
234
|
}
|
233
235
|
}
|
234
236
|
|
@@ -295,7 +297,8 @@ unsafe impl TypedData for RbDecoder {
|
|
295
297
|
}
|
296
298
|
|
297
299
|
fn data_type() -> &'static DataType {
|
298
|
-
static DATA_TYPE: DataType =
|
300
|
+
static DATA_TYPE: DataType =
|
301
|
+
data_type_builder!(RbDecoder, "Tokenizers::Decoders::Decoder").build();
|
299
302
|
&DATA_TYPE
|
300
303
|
}
|
301
304
|
|
@@ -383,18 +386,33 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
383
386
|
class.define_method("cleanup=", method!(RbDecoder::ctc_set_cleanup, 1))?;
|
384
387
|
class.define_method("pad_token", method!(RbDecoder::ctc_pad_token, 0))?;
|
385
388
|
class.define_method("pad_token=", method!(RbDecoder::ctc_set_pad_token, 1))?;
|
386
|
-
class.define_method(
|
387
|
-
|
389
|
+
class.define_method(
|
390
|
+
"word_delimiter_token",
|
391
|
+
method!(RbDecoder::ctc_word_delimiter_token, 0),
|
392
|
+
)?;
|
393
|
+
class.define_method(
|
394
|
+
"word_delimiter_token=",
|
395
|
+
method!(RbDecoder::ctc_set_word_delimiter_token, 1),
|
396
|
+
)?;
|
388
397
|
|
389
398
|
let class = module.define_class("Fuse", decoder)?;
|
390
399
|
class.define_singleton_method("new", function!(RbFuse::new, 0))?;
|
391
400
|
|
392
401
|
let class = module.define_class("Metaspace", decoder)?;
|
393
402
|
class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 3))?;
|
394
|
-
class.define_method(
|
395
|
-
|
403
|
+
class.define_method(
|
404
|
+
"prepend_scheme",
|
405
|
+
method!(RbDecoder::metaspace_prepend_scheme, 0),
|
406
|
+
)?;
|
407
|
+
class.define_method(
|
408
|
+
"prepend_scheme=",
|
409
|
+
method!(RbDecoder::metaspace_set_prepend_scheme, 1),
|
410
|
+
)?;
|
396
411
|
class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
|
397
|
-
class.define_method(
|
412
|
+
class.define_method(
|
413
|
+
"replacement=",
|
414
|
+
method!(RbDecoder::metaspace_set_replacement, 1),
|
415
|
+
)?;
|
398
416
|
class.define_method("split", method!(RbDecoder::metaspace_split, 0))?;
|
399
417
|
class.define_method("split=", method!(RbDecoder::metaspace_set_split, 1))?;
|
400
418
|
|
data/ext/tokenizers/src/error.rs
CHANGED
@@ -9,9 +9,14 @@ impl RbError {
|
|
9
9
|
pub fn from(e: Box<dyn std::error::Error + Send + Sync>) -> Error {
|
10
10
|
Error::new(error(), e.to_string())
|
11
11
|
}
|
12
|
+
|
13
|
+
pub fn new_err(s: String) -> Error {
|
14
|
+
Error::new(error(), s)
|
15
|
+
}
|
12
16
|
}
|
13
17
|
|
14
|
-
static ERROR: Lazy<ExceptionClass> =
|
18
|
+
static ERROR: Lazy<ExceptionClass> =
|
19
|
+
Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Error").unwrap());
|
15
20
|
|
16
21
|
fn error() -> ExceptionClass {
|
17
22
|
Ruby::get().unwrap().get_inner(&ERROR)
|