tokenizers 0.5.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Cargo.lock +162 -175
- data/ext/tokenizers/Cargo.toml +3 -3
- data/ext/tokenizers/src/normalizers.rs +2 -2
- data/ext/tokenizers/src/pre_tokenizers.rs +2 -2
- data/ext/tokenizers/src/tokenizer.rs +11 -11
- data/ext/tokenizers/src/trainers.rs +16 -16
- data/lib/tokenizers/from_pretrained.rb +2 -2
- data/lib/tokenizers/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4846b5d3dc0fe8f5828ddffe46908b1f3812ebf6a03a939ca0395ad7748533bb
|
4
|
+
data.tar.gz: 259795bfa6b13a36f62ab2ffb65e9feabd460e01efe9f59e7d6017c6dcd9b9b0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 90f55feb8ceec81815bb61b7773e6f67924eb6d47869e5da67d579e2ac9df6a48fddee5e97f5a028e3fa1e39941f3bfe6ec6c04cf49fd1b87f18bade54911231
|
7
|
+
data.tar.gz: 9c1895b43222494b393f3fbddaa6e78216e025f3d3dddebf0c0311d2d897b282a16b8b0044aacb2466790b0a93c8d01099b25d662750b96d5798d0a4a927267b
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
@@ -4,19 +4,13 @@ version = 3
|
|
4
4
|
|
5
5
|
[[package]]
|
6
6
|
name = "aho-corasick"
|
7
|
-
version = "1.1.
|
7
|
+
version = "1.1.3"
|
8
8
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
9
|
-
checksum = "
|
9
|
+
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
10
10
|
dependencies = [
|
11
11
|
"memchr",
|
12
12
|
]
|
13
13
|
|
14
|
-
[[package]]
|
15
|
-
name = "autocfg"
|
16
|
-
version = "1.1.0"
|
17
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
18
|
-
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
|
19
|
-
|
20
14
|
[[package]]
|
21
15
|
name = "base64"
|
22
16
|
version = "0.13.1"
|
@@ -25,16 +19,16 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
|
25
19
|
|
26
20
|
[[package]]
|
27
21
|
name = "bindgen"
|
28
|
-
version = "0.69.
|
22
|
+
version = "0.69.4"
|
29
23
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
30
|
-
checksum = "
|
24
|
+
checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0"
|
31
25
|
dependencies = [
|
32
|
-
"bitflags 2.
|
26
|
+
"bitflags 2.6.0",
|
33
27
|
"cexpr",
|
34
28
|
"clang-sys",
|
29
|
+
"itertools 0.12.1",
|
35
30
|
"lazy_static",
|
36
31
|
"lazycell",
|
37
|
-
"peeking_take_while",
|
38
32
|
"proc-macro2",
|
39
33
|
"quote",
|
40
34
|
"regex",
|
@@ -51,15 +45,21 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
|
51
45
|
|
52
46
|
[[package]]
|
53
47
|
name = "bitflags"
|
54
|
-
version = "2.
|
48
|
+
version = "2.6.0"
|
55
49
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
56
|
-
checksum = "
|
50
|
+
checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
|
51
|
+
|
52
|
+
[[package]]
|
53
|
+
name = "byteorder"
|
54
|
+
version = "1.5.0"
|
55
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
56
|
+
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
|
57
57
|
|
58
58
|
[[package]]
|
59
59
|
name = "cc"
|
60
|
-
version = "1.
|
60
|
+
version = "1.1.8"
|
61
61
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
62
|
-
checksum = "
|
62
|
+
checksum = "504bdec147f2cc13c8b57ed9401fd8a147cc66b67ad5cb241394244f2c947549"
|
63
63
|
|
64
64
|
[[package]]
|
65
65
|
name = "cexpr"
|
@@ -78,9 +78,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
|
78
78
|
|
79
79
|
[[package]]
|
80
80
|
name = "clang-sys"
|
81
|
-
version = "1.
|
81
|
+
version = "1.8.1"
|
82
82
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
83
|
-
checksum = "
|
83
|
+
checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
|
84
84
|
dependencies = [
|
85
85
|
"glob",
|
86
86
|
"libc",
|
@@ -89,9 +89,9 @@ dependencies = [
|
|
89
89
|
|
90
90
|
[[package]]
|
91
91
|
name = "console"
|
92
|
-
version = "0.15.
|
92
|
+
version = "0.15.8"
|
93
93
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
94
|
-
checksum = "
|
94
|
+
checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb"
|
95
95
|
dependencies = [
|
96
96
|
"encode_unicode",
|
97
97
|
"lazy_static",
|
@@ -102,42 +102,34 @@ dependencies = [
|
|
102
102
|
|
103
103
|
[[package]]
|
104
104
|
name = "crossbeam-deque"
|
105
|
-
version = "0.8.
|
105
|
+
version = "0.8.5"
|
106
106
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
107
|
-
checksum = "
|
107
|
+
checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
|
108
108
|
dependencies = [
|
109
|
-
"cfg-if",
|
110
109
|
"crossbeam-epoch",
|
111
110
|
"crossbeam-utils",
|
112
111
|
]
|
113
112
|
|
114
113
|
[[package]]
|
115
114
|
name = "crossbeam-epoch"
|
116
|
-
version = "0.9.
|
115
|
+
version = "0.9.18"
|
117
116
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
118
|
-
checksum = "
|
117
|
+
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
119
118
|
dependencies = [
|
120
|
-
"autocfg",
|
121
|
-
"cfg-if",
|
122
119
|
"crossbeam-utils",
|
123
|
-
"memoffset",
|
124
|
-
"scopeguard",
|
125
120
|
]
|
126
121
|
|
127
122
|
[[package]]
|
128
123
|
name = "crossbeam-utils"
|
129
|
-
version = "0.8.
|
124
|
+
version = "0.8.20"
|
130
125
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
131
|
-
checksum = "
|
132
|
-
dependencies = [
|
133
|
-
"cfg-if",
|
134
|
-
]
|
126
|
+
checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
|
135
127
|
|
136
128
|
[[package]]
|
137
129
|
name = "darling"
|
138
|
-
version = "0.20.
|
130
|
+
version = "0.20.10"
|
139
131
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
140
|
-
checksum = "
|
132
|
+
checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989"
|
141
133
|
dependencies = [
|
142
134
|
"darling_core",
|
143
135
|
"darling_macro",
|
@@ -145,9 +137,9 @@ dependencies = [
|
|
145
137
|
|
146
138
|
[[package]]
|
147
139
|
name = "darling_core"
|
148
|
-
version = "0.20.
|
140
|
+
version = "0.20.10"
|
149
141
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
150
|
-
checksum = "
|
142
|
+
checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5"
|
151
143
|
dependencies = [
|
152
144
|
"fnv",
|
153
145
|
"ident_case",
|
@@ -159,9 +151,9 @@ dependencies = [
|
|
159
151
|
|
160
152
|
[[package]]
|
161
153
|
name = "darling_macro"
|
162
|
-
version = "0.20.
|
154
|
+
version = "0.20.10"
|
163
155
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
164
|
-
checksum = "
|
156
|
+
checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806"
|
165
157
|
dependencies = [
|
166
158
|
"darling_core",
|
167
159
|
"quote",
|
@@ -201,9 +193,9 @@ dependencies = [
|
|
201
193
|
|
202
194
|
[[package]]
|
203
195
|
name = "either"
|
204
|
-
version = "1.
|
196
|
+
version = "1.13.0"
|
205
197
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
206
|
-
checksum = "
|
198
|
+
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
|
207
199
|
|
208
200
|
[[package]]
|
209
201
|
name = "encode_unicode"
|
@@ -228,9 +220,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
|
228
220
|
|
229
221
|
[[package]]
|
230
222
|
name = "getrandom"
|
231
|
-
version = "0.2.
|
223
|
+
version = "0.2.15"
|
232
224
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
233
|
-
checksum = "
|
225
|
+
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
|
234
226
|
dependencies = [
|
235
227
|
"cfg-if",
|
236
228
|
"libc",
|
@@ -251,9 +243,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
|
|
251
243
|
|
252
244
|
[[package]]
|
253
245
|
name = "indicatif"
|
254
|
-
version = "0.17.
|
246
|
+
version = "0.17.8"
|
255
247
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
256
|
-
checksum = "
|
248
|
+
checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3"
|
257
249
|
dependencies = [
|
258
250
|
"console",
|
259
251
|
"instant",
|
@@ -264,9 +256,9 @@ dependencies = [
|
|
264
256
|
|
265
257
|
[[package]]
|
266
258
|
name = "instant"
|
267
|
-
version = "0.1.
|
259
|
+
version = "0.1.13"
|
268
260
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
269
|
-
checksum = "
|
261
|
+
checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
|
270
262
|
dependencies = [
|
271
263
|
"cfg-if",
|
272
264
|
]
|
@@ -291,15 +283,15 @@ dependencies = [
|
|
291
283
|
|
292
284
|
[[package]]
|
293
285
|
name = "itoa"
|
294
|
-
version = "1.0.
|
286
|
+
version = "1.0.11"
|
295
287
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
296
|
-
checksum = "
|
288
|
+
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
|
297
289
|
|
298
290
|
[[package]]
|
299
291
|
name = "lazy_static"
|
300
|
-
version = "1.
|
292
|
+
version = "1.5.0"
|
301
293
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
302
|
-
checksum = "
|
294
|
+
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
303
295
|
|
304
296
|
[[package]]
|
305
297
|
name = "lazycell"
|
@@ -309,28 +301,25 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
|
309
301
|
|
310
302
|
[[package]]
|
311
303
|
name = "libc"
|
312
|
-
version = "0.2.
|
304
|
+
version = "0.2.155"
|
313
305
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
314
|
-
checksum = "
|
306
|
+
checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
|
315
307
|
|
316
308
|
[[package]]
|
317
309
|
name = "libloading"
|
318
|
-
version = "0.
|
310
|
+
version = "0.8.5"
|
319
311
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
320
|
-
checksum = "
|
312
|
+
checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
|
321
313
|
dependencies = [
|
322
314
|
"cfg-if",
|
323
|
-
"
|
315
|
+
"windows-targets",
|
324
316
|
]
|
325
317
|
|
326
318
|
[[package]]
|
327
319
|
name = "log"
|
328
|
-
version = "0.4.
|
320
|
+
version = "0.4.22"
|
329
321
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
330
|
-
checksum = "
|
331
|
-
dependencies = [
|
332
|
-
"cfg-if",
|
333
|
-
]
|
322
|
+
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
|
334
323
|
|
335
324
|
[[package]]
|
336
325
|
name = "macro_rules_attribute"
|
@@ -350,9 +339,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
|
|
350
339
|
|
351
340
|
[[package]]
|
352
341
|
name = "magnus"
|
353
|
-
version = "0.
|
342
|
+
version = "0.7.1"
|
354
343
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
355
|
-
checksum = "
|
344
|
+
checksum = "3d87ae53030f3a22e83879e666cb94e58a7bdf31706878a0ba48752994146dab"
|
356
345
|
dependencies = [
|
357
346
|
"magnus-macros",
|
358
347
|
"rb-sys",
|
@@ -373,18 +362,9 @@ dependencies = [
|
|
373
362
|
|
374
363
|
[[package]]
|
375
364
|
name = "memchr"
|
376
|
-
version = "2.
|
365
|
+
version = "2.7.4"
|
377
366
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
378
|
-
checksum = "
|
379
|
-
|
380
|
-
[[package]]
|
381
|
-
name = "memoffset"
|
382
|
-
version = "0.8.0"
|
383
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
384
|
-
checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
|
385
|
-
dependencies = [
|
386
|
-
"autocfg",
|
387
|
-
]
|
367
|
+
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
388
368
|
|
389
369
|
[[package]]
|
390
370
|
name = "minimal-lexical"
|
@@ -394,9 +374,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
|
394
374
|
|
395
375
|
[[package]]
|
396
376
|
name = "monostate"
|
397
|
-
version = "0.1.
|
377
|
+
version = "0.1.13"
|
398
378
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
399
|
-
checksum = "
|
379
|
+
checksum = "0d208407d7552cd041d8cdb69a1bc3303e029c598738177a3d87082004dc0e1e"
|
400
380
|
dependencies = [
|
401
381
|
"monostate-impl",
|
402
382
|
"serde",
|
@@ -404,9 +384,9 @@ dependencies = [
|
|
404
384
|
|
405
385
|
[[package]]
|
406
386
|
name = "monostate-impl"
|
407
|
-
version = "0.1.
|
387
|
+
version = "0.1.13"
|
408
388
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
409
|
-
checksum = "
|
389
|
+
checksum = "a7ce64b975ed4f123575d11afd9491f2e37bbd5813fbfbc0f09ae1fbddea74e0"
|
410
390
|
dependencies = [
|
411
391
|
"proc-macro2",
|
412
392
|
"quote",
|
@@ -431,9 +411,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
|
|
431
411
|
|
432
412
|
[[package]]
|
433
413
|
name = "once_cell"
|
434
|
-
version = "1.
|
414
|
+
version = "1.19.0"
|
435
415
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
436
|
-
checksum = "
|
416
|
+
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
|
437
417
|
|
438
418
|
[[package]]
|
439
419
|
name = "onig"
|
@@ -459,39 +439,36 @@ dependencies = [
|
|
459
439
|
|
460
440
|
[[package]]
|
461
441
|
name = "paste"
|
462
|
-
version = "1.0.
|
442
|
+
version = "1.0.15"
|
463
443
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
464
|
-
checksum = "
|
465
|
-
|
466
|
-
[[package]]
|
467
|
-
name = "peeking_take_while"
|
468
|
-
version = "0.1.2"
|
469
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
470
|
-
checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
|
444
|
+
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
|
471
445
|
|
472
446
|
[[package]]
|
473
447
|
name = "pkg-config"
|
474
|
-
version = "0.3.
|
448
|
+
version = "0.3.30"
|
475
449
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
476
|
-
checksum = "
|
450
|
+
checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec"
|
477
451
|
|
478
452
|
[[package]]
|
479
453
|
name = "portable-atomic"
|
480
|
-
version = "1.
|
454
|
+
version = "1.7.0"
|
481
455
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
482
|
-
checksum = "
|
456
|
+
checksum = "da544ee218f0d287a911e9c99a39a8c9bc8fcad3cb8db5959940044ecfc67265"
|
483
457
|
|
484
458
|
[[package]]
|
485
459
|
name = "ppv-lite86"
|
486
|
-
version = "0.2.
|
460
|
+
version = "0.2.20"
|
487
461
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
488
|
-
checksum = "
|
462
|
+
checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
|
463
|
+
dependencies = [
|
464
|
+
"zerocopy",
|
465
|
+
]
|
489
466
|
|
490
467
|
[[package]]
|
491
468
|
name = "proc-macro2"
|
492
|
-
version = "1.0.
|
469
|
+
version = "1.0.86"
|
493
470
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
494
|
-
checksum = "
|
471
|
+
checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
|
495
472
|
dependencies = [
|
496
473
|
"unicode-ident",
|
497
474
|
]
|
@@ -568,18 +545,18 @@ dependencies = [
|
|
568
545
|
|
569
546
|
[[package]]
|
570
547
|
name = "rb-sys"
|
571
|
-
version = "0.9.
|
548
|
+
version = "0.9.100"
|
572
549
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
573
|
-
checksum = "
|
550
|
+
checksum = "87f2ba20be84b32fad6b0ce397764bcdd0f2dca4431cf7035f6a6721e5747565"
|
574
551
|
dependencies = [
|
575
552
|
"rb-sys-build",
|
576
553
|
]
|
577
554
|
|
578
555
|
[[package]]
|
579
556
|
name = "rb-sys-build"
|
580
|
-
version = "0.9.
|
557
|
+
version = "0.9.100"
|
581
558
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
582
|
-
checksum = "
|
559
|
+
checksum = "7ecae2bdcb118ee721d9a3929f89e8578237fade298dfcf8c928609aa88abc48"
|
583
560
|
dependencies = [
|
584
561
|
"bindgen",
|
585
562
|
"lazy_static",
|
@@ -598,9 +575,9 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
|
|
598
575
|
|
599
576
|
[[package]]
|
600
577
|
name = "regex"
|
601
|
-
version = "1.10.
|
578
|
+
version = "1.10.6"
|
602
579
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
603
|
-
checksum = "
|
580
|
+
checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619"
|
604
581
|
dependencies = [
|
605
582
|
"aho-corasick",
|
606
583
|
"memchr",
|
@@ -610,9 +587,9 @@ dependencies = [
|
|
610
587
|
|
611
588
|
[[package]]
|
612
589
|
name = "regex-automata"
|
613
|
-
version = "0.4.
|
590
|
+
version = "0.4.7"
|
614
591
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
615
|
-
checksum = "
|
592
|
+
checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
|
616
593
|
dependencies = [
|
617
594
|
"aho-corasick",
|
618
595
|
"memchr",
|
@@ -621,9 +598,9 @@ dependencies = [
|
|
621
598
|
|
622
599
|
[[package]]
|
623
600
|
name = "regex-syntax"
|
624
|
-
version = "0.8.
|
601
|
+
version = "0.8.4"
|
625
602
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
626
|
-
checksum = "
|
603
|
+
checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
|
627
604
|
|
628
605
|
[[package]]
|
629
606
|
name = "rustc-hash"
|
@@ -633,15 +610,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
|
|
633
610
|
|
634
611
|
[[package]]
|
635
612
|
name = "ryu"
|
636
|
-
version = "1.0.
|
613
|
+
version = "1.0.18"
|
637
614
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
638
|
-
checksum = "
|
639
|
-
|
640
|
-
[[package]]
|
641
|
-
name = "scopeguard"
|
642
|
-
version = "1.1.0"
|
643
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
644
|
-
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
|
615
|
+
checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
|
645
616
|
|
646
617
|
[[package]]
|
647
618
|
name = "seq-macro"
|
@@ -651,18 +622,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
|
|
651
622
|
|
652
623
|
[[package]]
|
653
624
|
name = "serde"
|
654
|
-
version = "1.0.
|
625
|
+
version = "1.0.205"
|
655
626
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
656
|
-
checksum = "
|
627
|
+
checksum = "e33aedb1a7135da52b7c21791455563facbbcc43d0f0f66165b42c21b3dfb150"
|
657
628
|
dependencies = [
|
658
629
|
"serde_derive",
|
659
630
|
]
|
660
631
|
|
661
632
|
[[package]]
|
662
633
|
name = "serde_derive"
|
663
|
-
version = "1.0.
|
634
|
+
version = "1.0.205"
|
664
635
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
665
|
-
checksum = "
|
636
|
+
checksum = "692d6f5ac90220161d6774db30c662202721e64aed9058d2c394f451261420c1"
|
666
637
|
dependencies = [
|
667
638
|
"proc-macro2",
|
668
639
|
"quote",
|
@@ -671,11 +642,12 @@ dependencies = [
|
|
671
642
|
|
672
643
|
[[package]]
|
673
644
|
name = "serde_json"
|
674
|
-
version = "1.0.
|
645
|
+
version = "1.0.122"
|
675
646
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
676
|
-
checksum = "
|
647
|
+
checksum = "784b6203951c57ff748476b126ccb5e8e2959a5c19e5c617ab1956be3dbc68da"
|
677
648
|
dependencies = [
|
678
649
|
"itoa",
|
650
|
+
"memchr",
|
679
651
|
"ryu",
|
680
652
|
"serde",
|
681
653
|
]
|
@@ -688,15 +660,15 @@ checksum = "24188a676b6ae68c3b2cb3a01be17fbf7240ce009799bb56d5b1409051e78fde"
|
|
688
660
|
|
689
661
|
[[package]]
|
690
662
|
name = "shlex"
|
691
|
-
version = "1.
|
663
|
+
version = "1.3.0"
|
692
664
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
693
|
-
checksum = "
|
665
|
+
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
694
666
|
|
695
667
|
[[package]]
|
696
668
|
name = "smallvec"
|
697
|
-
version = "1.
|
669
|
+
version = "1.13.2"
|
698
670
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
699
|
-
checksum = "
|
671
|
+
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
|
700
672
|
|
701
673
|
[[package]]
|
702
674
|
name = "spm_precompiled"
|
@@ -712,15 +684,15 @@ dependencies = [
|
|
712
684
|
|
713
685
|
[[package]]
|
714
686
|
name = "strsim"
|
715
|
-
version = "0.
|
687
|
+
version = "0.11.1"
|
716
688
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
717
|
-
checksum = "
|
689
|
+
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
718
690
|
|
719
691
|
[[package]]
|
720
692
|
name = "syn"
|
721
|
-
version = "2.0.
|
693
|
+
version = "2.0.72"
|
722
694
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
723
|
-
checksum = "
|
695
|
+
checksum = "dc4b9b9bf2add8093d3f2c0204471e951b2285580335de42f9d2534f3ae7a8af"
|
724
696
|
dependencies = [
|
725
697
|
"proc-macro2",
|
726
698
|
"quote",
|
@@ -729,18 +701,18 @@ dependencies = [
|
|
729
701
|
|
730
702
|
[[package]]
|
731
703
|
name = "thiserror"
|
732
|
-
version = "1.0.
|
704
|
+
version = "1.0.63"
|
733
705
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
734
|
-
checksum = "
|
706
|
+
checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724"
|
735
707
|
dependencies = [
|
736
708
|
"thiserror-impl",
|
737
709
|
]
|
738
710
|
|
739
711
|
[[package]]
|
740
712
|
name = "thiserror-impl"
|
741
|
-
version = "1.0.
|
713
|
+
version = "1.0.63"
|
742
714
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
743
|
-
checksum = "
|
715
|
+
checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261"
|
744
716
|
dependencies = [
|
745
717
|
"proc-macro2",
|
746
718
|
"quote",
|
@@ -749,19 +721,19 @@ dependencies = [
|
|
749
721
|
|
750
722
|
[[package]]
|
751
723
|
name = "tokenizers"
|
752
|
-
version = "0.5.
|
724
|
+
version = "0.5.1"
|
753
725
|
dependencies = [
|
754
726
|
"magnus",
|
755
727
|
"onig",
|
756
728
|
"serde",
|
757
|
-
"tokenizers 0.
|
729
|
+
"tokenizers 0.20.0",
|
758
730
|
]
|
759
731
|
|
760
732
|
[[package]]
|
761
733
|
name = "tokenizers"
|
762
|
-
version = "0.
|
734
|
+
version = "0.20.0"
|
763
735
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
764
|
-
checksum = "
|
736
|
+
checksum = "c8a24d7f7d6be5b9d1377418b893ab1808af0074f5d1bb2c64784452ddd2aa70"
|
765
737
|
dependencies = [
|
766
738
|
"aho-corasick",
|
767
739
|
"derive_builder",
|
@@ -791,9 +763,9 @@ dependencies = [
|
|
791
763
|
|
792
764
|
[[package]]
|
793
765
|
name = "unicode-ident"
|
794
|
-
version = "1.0.
|
766
|
+
version = "1.0.12"
|
795
767
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
796
|
-
checksum = "
|
768
|
+
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
|
797
769
|
|
798
770
|
[[package]]
|
799
771
|
name = "unicode-normalization-alignments"
|
@@ -812,9 +784,9 @@ checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
|
|
812
784
|
|
813
785
|
[[package]]
|
814
786
|
name = "unicode-width"
|
815
|
-
version = "0.1.
|
787
|
+
version = "0.1.13"
|
816
788
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
817
|
-
checksum = "
|
789
|
+
checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d"
|
818
790
|
|
819
791
|
[[package]]
|
820
792
|
name = "unicode_categories"
|
@@ -829,36 +801,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
829
801
|
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
830
802
|
|
831
803
|
[[package]]
|
832
|
-
name = "
|
833
|
-
version = "0.
|
804
|
+
name = "windows-sys"
|
805
|
+
version = "0.52.0"
|
834
806
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
835
|
-
checksum = "
|
807
|
+
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
|
836
808
|
dependencies = [
|
837
|
-
"
|
838
|
-
"winapi-x86_64-pc-windows-gnu",
|
809
|
+
"windows-targets",
|
839
810
|
]
|
840
811
|
|
841
812
|
[[package]]
|
842
|
-
name = "
|
843
|
-
version = "0.
|
813
|
+
name = "windows-targets"
|
814
|
+
version = "0.52.6"
|
844
815
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
845
|
-
checksum = "
|
846
|
-
|
847
|
-
[[package]]
|
848
|
-
name = "winapi-x86_64-pc-windows-gnu"
|
849
|
-
version = "0.4.0"
|
850
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
851
|
-
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
852
|
-
|
853
|
-
[[package]]
|
854
|
-
name = "windows-sys"
|
855
|
-
version = "0.42.0"
|
856
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
857
|
-
checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
|
816
|
+
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
|
858
817
|
dependencies = [
|
859
818
|
"windows_aarch64_gnullvm",
|
860
819
|
"windows_aarch64_msvc",
|
861
820
|
"windows_i686_gnu",
|
821
|
+
"windows_i686_gnullvm",
|
862
822
|
"windows_i686_msvc",
|
863
823
|
"windows_x86_64_gnu",
|
864
824
|
"windows_x86_64_gnullvm",
|
@@ -867,42 +827,69 @@ dependencies = [
|
|
867
827
|
|
868
828
|
[[package]]
|
869
829
|
name = "windows_aarch64_gnullvm"
|
870
|
-
version = "0.
|
830
|
+
version = "0.52.6"
|
871
831
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
872
|
-
checksum = "
|
832
|
+
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
|
873
833
|
|
874
834
|
[[package]]
|
875
835
|
name = "windows_aarch64_msvc"
|
876
|
-
version = "0.
|
836
|
+
version = "0.52.6"
|
877
837
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
878
|
-
checksum = "
|
838
|
+
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
|
879
839
|
|
880
840
|
[[package]]
|
881
841
|
name = "windows_i686_gnu"
|
882
|
-
version = "0.
|
842
|
+
version = "0.52.6"
|
883
843
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
884
|
-
checksum = "
|
844
|
+
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
|
845
|
+
|
846
|
+
[[package]]
|
847
|
+
name = "windows_i686_gnullvm"
|
848
|
+
version = "0.52.6"
|
849
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
850
|
+
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
|
885
851
|
|
886
852
|
[[package]]
|
887
853
|
name = "windows_i686_msvc"
|
888
|
-
version = "0.
|
854
|
+
version = "0.52.6"
|
889
855
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
890
|
-
checksum = "
|
856
|
+
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
|
891
857
|
|
892
858
|
[[package]]
|
893
859
|
name = "windows_x86_64_gnu"
|
894
|
-
version = "0.
|
860
|
+
version = "0.52.6"
|
895
861
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
896
|
-
checksum = "
|
862
|
+
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
|
897
863
|
|
898
864
|
[[package]]
|
899
865
|
name = "windows_x86_64_gnullvm"
|
900
|
-
version = "0.
|
866
|
+
version = "0.52.6"
|
901
867
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
902
|
-
checksum = "
|
868
|
+
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
|
903
869
|
|
904
870
|
[[package]]
|
905
871
|
name = "windows_x86_64_msvc"
|
906
|
-
version = "0.
|
872
|
+
version = "0.52.6"
|
907
873
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
908
|
-
checksum = "
|
874
|
+
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
875
|
+
|
876
|
+
[[package]]
|
877
|
+
name = "zerocopy"
|
878
|
+
version = "0.7.35"
|
879
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
880
|
+
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
|
881
|
+
dependencies = [
|
882
|
+
"byteorder",
|
883
|
+
"zerocopy-derive",
|
884
|
+
]
|
885
|
+
|
886
|
+
[[package]]
|
887
|
+
name = "zerocopy-derive"
|
888
|
+
version = "0.7.35"
|
889
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
890
|
+
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
|
891
|
+
dependencies = [
|
892
|
+
"proc-macro2",
|
893
|
+
"quote",
|
894
|
+
"syn",
|
895
|
+
]
|
data/ext/tokenizers/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "tokenizers"
|
3
|
-
version = "0.5.
|
3
|
+
version = "0.5.1"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -11,11 +11,11 @@ publish = false
|
|
11
11
|
crate-type = ["cdylib"]
|
12
12
|
|
13
13
|
[dependencies]
|
14
|
-
magnus = "0.
|
14
|
+
magnus = "0.7"
|
15
15
|
onig = { version = "6", default-features = false }
|
16
16
|
serde = { version = "1", features = ["rc", "derive"] }
|
17
17
|
|
18
18
|
[dependencies.tokenizers]
|
19
|
-
version = "=0.
|
19
|
+
version = "=0.20.0" # also update in from_pretrained.rb
|
20
20
|
default-features = false
|
21
21
|
features = ["progressbar", "onig", "esaxx_fast"]
|
@@ -222,8 +222,8 @@ pub struct RbSequence {}
|
|
222
222
|
impl RbSequence {
|
223
223
|
fn new(normalizers: RArray) -> RbResult<RbNormalizer> {
|
224
224
|
let mut sequence = Vec::with_capacity(normalizers.len());
|
225
|
-
for n in normalizers.
|
226
|
-
let normalizer: &RbNormalizer = TryConvert::try_convert(n
|
225
|
+
for n in normalizers.into_iter() {
|
226
|
+
let normalizer: &RbNormalizer = TryConvert::try_convert(n)?;
|
227
227
|
match &normalizer.normalizer {
|
228
228
|
RbNormalizerTypeWrapper::Sequence(inner) => sequence.extend(inner.iter().cloned()),
|
229
229
|
RbNormalizerTypeWrapper::Single(inner) => sequence.push(inner.clone()),
|
@@ -258,8 +258,8 @@ pub struct RbSequence {}
|
|
258
258
|
impl RbSequence {
|
259
259
|
fn new(pre_tokenizers: RArray) -> RbResult<RbPreTokenizer> {
|
260
260
|
let mut sequence = Vec::with_capacity(pre_tokenizers.len());
|
261
|
-
for n in pre_tokenizers.
|
262
|
-
let pretokenizer: &RbPreTokenizer = TryConvert::try_convert(n
|
261
|
+
for n in pre_tokenizers.into_iter() {
|
262
|
+
let pretokenizer: &RbPreTokenizer = TryConvert::try_convert(n)?;
|
263
263
|
match &pretokenizer.pretok {
|
264
264
|
RbPreTokenizerTypeWrapper::Sequence(inner) => {
|
265
265
|
sequence.extend(inner.iter().cloned())
|
@@ -282,12 +282,12 @@ impl RbTokenizer {
|
|
282
282
|
add_special_tokens: bool,
|
283
283
|
) -> RbResult<RArray> {
|
284
284
|
let input: Vec<tk::EncodeInput> = input
|
285
|
-
.
|
285
|
+
.into_iter()
|
286
286
|
.map(|o| {
|
287
287
|
let input: tk::EncodeInput = if is_pretokenized {
|
288
|
-
PreTokenizedEncodeInput::try_convert(o
|
288
|
+
PreTokenizedEncodeInput::try_convert(o)?.into()
|
289
289
|
} else {
|
290
|
-
TextEncodeInput::try_convert(o
|
290
|
+
TextEncodeInput::try_convert(o)?.into()
|
291
291
|
};
|
292
292
|
Ok(input)
|
293
293
|
})
|
@@ -319,26 +319,26 @@ impl RbTokenizer {
|
|
319
319
|
.map_err(RbError::from)
|
320
320
|
}
|
321
321
|
|
322
|
-
pub fn set_decoder(&self, decoder:
|
323
|
-
self.tokenizer.borrow_mut().with_decoder(decoder.
|
322
|
+
pub fn set_decoder(&self, decoder: Option<&RbDecoder>) {
|
323
|
+
self.tokenizer.borrow_mut().with_decoder(decoder.cloned());
|
324
324
|
}
|
325
325
|
|
326
|
-
pub fn set_pre_tokenizer(&self, pretok:
|
326
|
+
pub fn set_pre_tokenizer(&self, pretok: Option<&RbPreTokenizer>) {
|
327
327
|
self.tokenizer
|
328
328
|
.borrow_mut()
|
329
|
-
.with_pre_tokenizer(pretok.
|
329
|
+
.with_pre_tokenizer(pretok.cloned());
|
330
330
|
}
|
331
331
|
|
332
|
-
pub fn set_post_processor(&self, processor:
|
332
|
+
pub fn set_post_processor(&self, processor: Option<&RbPostProcessor>) {
|
333
333
|
self.tokenizer
|
334
334
|
.borrow_mut()
|
335
|
-
.with_post_processor(processor.
|
335
|
+
.with_post_processor(processor.cloned());
|
336
336
|
}
|
337
337
|
|
338
|
-
pub fn set_normalizer(&self, normalizer:
|
338
|
+
pub fn set_normalizer(&self, normalizer: Option<&RbNormalizer>) {
|
339
339
|
self.tokenizer
|
340
340
|
.borrow_mut()
|
341
|
-
.with_normalizer(normalizer.
|
341
|
+
.with_normalizer(normalizer.cloned());
|
342
342
|
}
|
343
343
|
|
344
344
|
pub fn token_to_id(&self, token: String) -> Option<u32> {
|
@@ -110,9 +110,9 @@ impl RbTrainer {
|
|
110
110
|
BpeTrainer,
|
111
111
|
special_tokens,
|
112
112
|
special_tokens
|
113
|
-
.
|
113
|
+
.into_iter()
|
114
114
|
.map(|token| {
|
115
|
-
if let Ok(content) = String::try_convert(token
|
115
|
+
if let Ok(content) = String::try_convert(token) {
|
116
116
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
117
117
|
} else {
|
118
118
|
todo!()
|
@@ -197,9 +197,9 @@ impl RbTrainer {
|
|
197
197
|
UnigramTrainer,
|
198
198
|
special_tokens,
|
199
199
|
special_tokens
|
200
|
-
.
|
200
|
+
.into_iter()
|
201
201
|
.map(|token| {
|
202
|
-
if let Ok(content) = String::try_convert(token
|
202
|
+
if let Ok(content) = String::try_convert(token) {
|
203
203
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
204
204
|
} else {
|
205
205
|
todo!()
|
@@ -268,9 +268,9 @@ impl RbTrainer {
|
|
268
268
|
WordLevelTrainer,
|
269
269
|
special_tokens,
|
270
270
|
special_tokens
|
271
|
-
.
|
271
|
+
.into_iter()
|
272
272
|
.map(|token| {
|
273
|
-
if let Ok(content) = String::try_convert(token
|
273
|
+
if let Ok(content) = String::try_convert(token) {
|
274
274
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
275
275
|
} else {
|
276
276
|
todo!()
|
@@ -322,9 +322,9 @@ impl RbTrainer {
|
|
322
322
|
WordPieceTrainer,
|
323
323
|
@set_special_tokens,
|
324
324
|
special_tokens
|
325
|
-
.
|
325
|
+
.into_iter()
|
326
326
|
.map(|token| {
|
327
|
-
if let Ok(content) = String::try_convert(token
|
327
|
+
if let Ok(content) = String::try_convert(token) {
|
328
328
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
329
329
|
} else {
|
330
330
|
todo!()
|
@@ -398,9 +398,9 @@ impl RbBpeTrainer {
|
|
398
398
|
if !value.is_nil() {
|
399
399
|
builder = builder.special_tokens(
|
400
400
|
RArray::try_convert(value)?
|
401
|
-
.
|
401
|
+
.into_iter()
|
402
402
|
.map(|token| {
|
403
|
-
if let Ok(content) = String::try_convert(token
|
403
|
+
if let Ok(content) = String::try_convert(token) {
|
404
404
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
405
405
|
} else {
|
406
406
|
todo!()
|
@@ -466,9 +466,9 @@ impl RbUnigramTrainer {
|
|
466
466
|
if !value.is_nil() {
|
467
467
|
builder.special_tokens(
|
468
468
|
RArray::try_convert(value)?
|
469
|
-
.
|
469
|
+
.into_iter()
|
470
470
|
.map(|token| {
|
471
|
-
if let Ok(content) = String::try_convert(token
|
471
|
+
if let Ok(content) = String::try_convert(token) {
|
472
472
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
473
473
|
} else {
|
474
474
|
todo!()
|
@@ -540,9 +540,9 @@ impl RbWordLevelTrainer {
|
|
540
540
|
if !value.is_nil() {
|
541
541
|
builder.special_tokens(
|
542
542
|
RArray::try_convert(value)?
|
543
|
-
.
|
543
|
+
.into_iter()
|
544
544
|
.map(|token| {
|
545
|
-
if let Ok(content) = String::try_convert(token
|
545
|
+
if let Ok(content) = String::try_convert(token) {
|
546
546
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
547
547
|
} else {
|
548
548
|
todo!()
|
@@ -581,9 +581,9 @@ impl RbWordPieceTrainer {
|
|
581
581
|
if !value.is_nil() {
|
582
582
|
builder = builder.special_tokens(
|
583
583
|
RArray::try_convert(value)?
|
584
|
-
.
|
584
|
+
.into_iter()
|
585
585
|
.map(|token| {
|
586
|
-
if let Ok(content) = String::try_convert(token
|
586
|
+
if let Ok(content) = String::try_convert(token) {
|
587
587
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
588
588
|
} else {
|
589
589
|
todo!()
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module Tokenizers
|
2
2
|
module FromPretrained
|
3
3
|
# for user agent
|
4
|
-
TOKENIZERS_VERSION = "0.
|
4
|
+
TOKENIZERS_VERSION = "0.20.0"
|
5
5
|
|
6
6
|
# use Ruby for downloads
|
7
7
|
# this avoids the need to vendor OpenSSL on Linux
|
@@ -67,7 +67,7 @@ module Tokenizers
|
|
67
67
|
end
|
68
68
|
end
|
69
69
|
|
70
|
-
options[:content_length_proc] = ->
|
70
|
+
options[:content_length_proc] = ->(_) { puts "Downloading..." }
|
71
71
|
|
72
72
|
# string options are headers
|
73
73
|
tempfile = URI.parse(url).open(headers.merge(options))
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-08-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
100
100
|
- !ruby/object:Gem::Version
|
101
101
|
version: '0'
|
102
102
|
requirements: []
|
103
|
-
rubygems_version: 3.5.
|
103
|
+
rubygems_version: 3.5.11
|
104
104
|
signing_key:
|
105
105
|
specification_version: 4
|
106
106
|
summary: Fast state-of-the-art tokenizers for Ruby
|