tokenizers 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Cargo.lock +162 -175
- data/ext/tokenizers/Cargo.toml +3 -3
- data/ext/tokenizers/src/normalizers.rs +2 -2
- data/ext/tokenizers/src/pre_tokenizers.rs +2 -2
- data/ext/tokenizers/src/tokenizer.rs +11 -11
- data/ext/tokenizers/src/trainers.rs +16 -16
- data/lib/tokenizers/from_pretrained.rb +2 -2
- data/lib/tokenizers/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4846b5d3dc0fe8f5828ddffe46908b1f3812ebf6a03a939ca0395ad7748533bb
|
4
|
+
data.tar.gz: 259795bfa6b13a36f62ab2ffb65e9feabd460e01efe9f59e7d6017c6dcd9b9b0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 90f55feb8ceec81815bb61b7773e6f67924eb6d47869e5da67d579e2ac9df6a48fddee5e97f5a028e3fa1e39941f3bfe6ec6c04cf49fd1b87f18bade54911231
|
7
|
+
data.tar.gz: 9c1895b43222494b393f3fbddaa6e78216e025f3d3dddebf0c0311d2d897b282a16b8b0044aacb2466790b0a93c8d01099b25d662750b96d5798d0a4a927267b
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
@@ -4,19 +4,13 @@ version = 3
|
|
4
4
|
|
5
5
|
[[package]]
|
6
6
|
name = "aho-corasick"
|
7
|
-
version = "1.1.
|
7
|
+
version = "1.1.3"
|
8
8
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
9
|
-
checksum = "
|
9
|
+
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
10
10
|
dependencies = [
|
11
11
|
"memchr",
|
12
12
|
]
|
13
13
|
|
14
|
-
[[package]]
|
15
|
-
name = "autocfg"
|
16
|
-
version = "1.1.0"
|
17
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
18
|
-
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
|
19
|
-
|
20
14
|
[[package]]
|
21
15
|
name = "base64"
|
22
16
|
version = "0.13.1"
|
@@ -25,16 +19,16 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
|
25
19
|
|
26
20
|
[[package]]
|
27
21
|
name = "bindgen"
|
28
|
-
version = "0.69.
|
22
|
+
version = "0.69.4"
|
29
23
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
30
|
-
checksum = "
|
24
|
+
checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0"
|
31
25
|
dependencies = [
|
32
|
-
"bitflags 2.
|
26
|
+
"bitflags 2.6.0",
|
33
27
|
"cexpr",
|
34
28
|
"clang-sys",
|
29
|
+
"itertools 0.12.1",
|
35
30
|
"lazy_static",
|
36
31
|
"lazycell",
|
37
|
-
"peeking_take_while",
|
38
32
|
"proc-macro2",
|
39
33
|
"quote",
|
40
34
|
"regex",
|
@@ -51,15 +45,21 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
|
51
45
|
|
52
46
|
[[package]]
|
53
47
|
name = "bitflags"
|
54
|
-
version = "2.
|
48
|
+
version = "2.6.0"
|
55
49
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
56
|
-
checksum = "
|
50
|
+
checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
|
51
|
+
|
52
|
+
[[package]]
|
53
|
+
name = "byteorder"
|
54
|
+
version = "1.5.0"
|
55
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
56
|
+
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
|
57
57
|
|
58
58
|
[[package]]
|
59
59
|
name = "cc"
|
60
|
-
version = "1.
|
60
|
+
version = "1.1.8"
|
61
61
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
62
|
-
checksum = "
|
62
|
+
checksum = "504bdec147f2cc13c8b57ed9401fd8a147cc66b67ad5cb241394244f2c947549"
|
63
63
|
|
64
64
|
[[package]]
|
65
65
|
name = "cexpr"
|
@@ -78,9 +78,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
|
78
78
|
|
79
79
|
[[package]]
|
80
80
|
name = "clang-sys"
|
81
|
-
version = "1.
|
81
|
+
version = "1.8.1"
|
82
82
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
83
|
-
checksum = "
|
83
|
+
checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
|
84
84
|
dependencies = [
|
85
85
|
"glob",
|
86
86
|
"libc",
|
@@ -89,9 +89,9 @@ dependencies = [
|
|
89
89
|
|
90
90
|
[[package]]
|
91
91
|
name = "console"
|
92
|
-
version = "0.15.
|
92
|
+
version = "0.15.8"
|
93
93
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
94
|
-
checksum = "
|
94
|
+
checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb"
|
95
95
|
dependencies = [
|
96
96
|
"encode_unicode",
|
97
97
|
"lazy_static",
|
@@ -102,42 +102,34 @@ dependencies = [
|
|
102
102
|
|
103
103
|
[[package]]
|
104
104
|
name = "crossbeam-deque"
|
105
|
-
version = "0.8.
|
105
|
+
version = "0.8.5"
|
106
106
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
107
|
-
checksum = "
|
107
|
+
checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
|
108
108
|
dependencies = [
|
109
|
-
"cfg-if",
|
110
109
|
"crossbeam-epoch",
|
111
110
|
"crossbeam-utils",
|
112
111
|
]
|
113
112
|
|
114
113
|
[[package]]
|
115
114
|
name = "crossbeam-epoch"
|
116
|
-
version = "0.9.
|
115
|
+
version = "0.9.18"
|
117
116
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
118
|
-
checksum = "
|
117
|
+
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
119
118
|
dependencies = [
|
120
|
-
"autocfg",
|
121
|
-
"cfg-if",
|
122
119
|
"crossbeam-utils",
|
123
|
-
"memoffset",
|
124
|
-
"scopeguard",
|
125
120
|
]
|
126
121
|
|
127
122
|
[[package]]
|
128
123
|
name = "crossbeam-utils"
|
129
|
-
version = "0.8.
|
124
|
+
version = "0.8.20"
|
130
125
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
131
|
-
checksum = "
|
132
|
-
dependencies = [
|
133
|
-
"cfg-if",
|
134
|
-
]
|
126
|
+
checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
|
135
127
|
|
136
128
|
[[package]]
|
137
129
|
name = "darling"
|
138
|
-
version = "0.20.
|
130
|
+
version = "0.20.10"
|
139
131
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
140
|
-
checksum = "
|
132
|
+
checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989"
|
141
133
|
dependencies = [
|
142
134
|
"darling_core",
|
143
135
|
"darling_macro",
|
@@ -145,9 +137,9 @@ dependencies = [
|
|
145
137
|
|
146
138
|
[[package]]
|
147
139
|
name = "darling_core"
|
148
|
-
version = "0.20.
|
140
|
+
version = "0.20.10"
|
149
141
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
150
|
-
checksum = "
|
142
|
+
checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5"
|
151
143
|
dependencies = [
|
152
144
|
"fnv",
|
153
145
|
"ident_case",
|
@@ -159,9 +151,9 @@ dependencies = [
|
|
159
151
|
|
160
152
|
[[package]]
|
161
153
|
name = "darling_macro"
|
162
|
-
version = "0.20.
|
154
|
+
version = "0.20.10"
|
163
155
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
164
|
-
checksum = "
|
156
|
+
checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806"
|
165
157
|
dependencies = [
|
166
158
|
"darling_core",
|
167
159
|
"quote",
|
@@ -201,9 +193,9 @@ dependencies = [
|
|
201
193
|
|
202
194
|
[[package]]
|
203
195
|
name = "either"
|
204
|
-
version = "1.
|
196
|
+
version = "1.13.0"
|
205
197
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
206
|
-
checksum = "
|
198
|
+
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
|
207
199
|
|
208
200
|
[[package]]
|
209
201
|
name = "encode_unicode"
|
@@ -228,9 +220,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
|
228
220
|
|
229
221
|
[[package]]
|
230
222
|
name = "getrandom"
|
231
|
-
version = "0.2.
|
223
|
+
version = "0.2.15"
|
232
224
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
233
|
-
checksum = "
|
225
|
+
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
|
234
226
|
dependencies = [
|
235
227
|
"cfg-if",
|
236
228
|
"libc",
|
@@ -251,9 +243,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
|
|
251
243
|
|
252
244
|
[[package]]
|
253
245
|
name = "indicatif"
|
254
|
-
version = "0.17.
|
246
|
+
version = "0.17.8"
|
255
247
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
256
|
-
checksum = "
|
248
|
+
checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3"
|
257
249
|
dependencies = [
|
258
250
|
"console",
|
259
251
|
"instant",
|
@@ -264,9 +256,9 @@ dependencies = [
|
|
264
256
|
|
265
257
|
[[package]]
|
266
258
|
name = "instant"
|
267
|
-
version = "0.1.
|
259
|
+
version = "0.1.13"
|
268
260
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
269
|
-
checksum = "
|
261
|
+
checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
|
270
262
|
dependencies = [
|
271
263
|
"cfg-if",
|
272
264
|
]
|
@@ -291,15 +283,15 @@ dependencies = [
|
|
291
283
|
|
292
284
|
[[package]]
|
293
285
|
name = "itoa"
|
294
|
-
version = "1.0.
|
286
|
+
version = "1.0.11"
|
295
287
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
296
|
-
checksum = "
|
288
|
+
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
|
297
289
|
|
298
290
|
[[package]]
|
299
291
|
name = "lazy_static"
|
300
|
-
version = "1.
|
292
|
+
version = "1.5.0"
|
301
293
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
302
|
-
checksum = "
|
294
|
+
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
303
295
|
|
304
296
|
[[package]]
|
305
297
|
name = "lazycell"
|
@@ -309,28 +301,25 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
|
309
301
|
|
310
302
|
[[package]]
|
311
303
|
name = "libc"
|
312
|
-
version = "0.2.
|
304
|
+
version = "0.2.155"
|
313
305
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
314
|
-
checksum = "
|
306
|
+
checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
|
315
307
|
|
316
308
|
[[package]]
|
317
309
|
name = "libloading"
|
318
|
-
version = "0.
|
310
|
+
version = "0.8.5"
|
319
311
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
320
|
-
checksum = "
|
312
|
+
checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
|
321
313
|
dependencies = [
|
322
314
|
"cfg-if",
|
323
|
-
"
|
315
|
+
"windows-targets",
|
324
316
|
]
|
325
317
|
|
326
318
|
[[package]]
|
327
319
|
name = "log"
|
328
|
-
version = "0.4.
|
320
|
+
version = "0.4.22"
|
329
321
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
330
|
-
checksum = "
|
331
|
-
dependencies = [
|
332
|
-
"cfg-if",
|
333
|
-
]
|
322
|
+
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
|
334
323
|
|
335
324
|
[[package]]
|
336
325
|
name = "macro_rules_attribute"
|
@@ -350,9 +339,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
|
|
350
339
|
|
351
340
|
[[package]]
|
352
341
|
name = "magnus"
|
353
|
-
version = "0.
|
342
|
+
version = "0.7.1"
|
354
343
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
355
|
-
checksum = "
|
344
|
+
checksum = "3d87ae53030f3a22e83879e666cb94e58a7bdf31706878a0ba48752994146dab"
|
356
345
|
dependencies = [
|
357
346
|
"magnus-macros",
|
358
347
|
"rb-sys",
|
@@ -373,18 +362,9 @@ dependencies = [
|
|
373
362
|
|
374
363
|
[[package]]
|
375
364
|
name = "memchr"
|
376
|
-
version = "2.
|
365
|
+
version = "2.7.4"
|
377
366
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
378
|
-
checksum = "
|
379
|
-
|
380
|
-
[[package]]
|
381
|
-
name = "memoffset"
|
382
|
-
version = "0.8.0"
|
383
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
384
|
-
checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
|
385
|
-
dependencies = [
|
386
|
-
"autocfg",
|
387
|
-
]
|
367
|
+
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
388
368
|
|
389
369
|
[[package]]
|
390
370
|
name = "minimal-lexical"
|
@@ -394,9 +374,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
|
394
374
|
|
395
375
|
[[package]]
|
396
376
|
name = "monostate"
|
397
|
-
version = "0.1.
|
377
|
+
version = "0.1.13"
|
398
378
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
399
|
-
checksum = "
|
379
|
+
checksum = "0d208407d7552cd041d8cdb69a1bc3303e029c598738177a3d87082004dc0e1e"
|
400
380
|
dependencies = [
|
401
381
|
"monostate-impl",
|
402
382
|
"serde",
|
@@ -404,9 +384,9 @@ dependencies = [
|
|
404
384
|
|
405
385
|
[[package]]
|
406
386
|
name = "monostate-impl"
|
407
|
-
version = "0.1.
|
387
|
+
version = "0.1.13"
|
408
388
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
409
|
-
checksum = "
|
389
|
+
checksum = "a7ce64b975ed4f123575d11afd9491f2e37bbd5813fbfbc0f09ae1fbddea74e0"
|
410
390
|
dependencies = [
|
411
391
|
"proc-macro2",
|
412
392
|
"quote",
|
@@ -431,9 +411,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
|
|
431
411
|
|
432
412
|
[[package]]
|
433
413
|
name = "once_cell"
|
434
|
-
version = "1.
|
414
|
+
version = "1.19.0"
|
435
415
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
436
|
-
checksum = "
|
416
|
+
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
|
437
417
|
|
438
418
|
[[package]]
|
439
419
|
name = "onig"
|
@@ -459,39 +439,36 @@ dependencies = [
|
|
459
439
|
|
460
440
|
[[package]]
|
461
441
|
name = "paste"
|
462
|
-
version = "1.0.
|
442
|
+
version = "1.0.15"
|
463
443
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
464
|
-
checksum = "
|
465
|
-
|
466
|
-
[[package]]
|
467
|
-
name = "peeking_take_while"
|
468
|
-
version = "0.1.2"
|
469
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
470
|
-
checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
|
444
|
+
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
|
471
445
|
|
472
446
|
[[package]]
|
473
447
|
name = "pkg-config"
|
474
|
-
version = "0.3.
|
448
|
+
version = "0.3.30"
|
475
449
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
476
|
-
checksum = "
|
450
|
+
checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec"
|
477
451
|
|
478
452
|
[[package]]
|
479
453
|
name = "portable-atomic"
|
480
|
-
version = "1.
|
454
|
+
version = "1.7.0"
|
481
455
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
482
|
-
checksum = "
|
456
|
+
checksum = "da544ee218f0d287a911e9c99a39a8c9bc8fcad3cb8db5959940044ecfc67265"
|
483
457
|
|
484
458
|
[[package]]
|
485
459
|
name = "ppv-lite86"
|
486
|
-
version = "0.2.
|
460
|
+
version = "0.2.20"
|
487
461
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
488
|
-
checksum = "
|
462
|
+
checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
|
463
|
+
dependencies = [
|
464
|
+
"zerocopy",
|
465
|
+
]
|
489
466
|
|
490
467
|
[[package]]
|
491
468
|
name = "proc-macro2"
|
492
|
-
version = "1.0.
|
469
|
+
version = "1.0.86"
|
493
470
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
494
|
-
checksum = "
|
471
|
+
checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
|
495
472
|
dependencies = [
|
496
473
|
"unicode-ident",
|
497
474
|
]
|
@@ -568,18 +545,18 @@ dependencies = [
|
|
568
545
|
|
569
546
|
[[package]]
|
570
547
|
name = "rb-sys"
|
571
|
-
version = "0.9.
|
548
|
+
version = "0.9.100"
|
572
549
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
573
|
-
checksum = "
|
550
|
+
checksum = "87f2ba20be84b32fad6b0ce397764bcdd0f2dca4431cf7035f6a6721e5747565"
|
574
551
|
dependencies = [
|
575
552
|
"rb-sys-build",
|
576
553
|
]
|
577
554
|
|
578
555
|
[[package]]
|
579
556
|
name = "rb-sys-build"
|
580
|
-
version = "0.9.
|
557
|
+
version = "0.9.100"
|
581
558
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
582
|
-
checksum = "
|
559
|
+
checksum = "7ecae2bdcb118ee721d9a3929f89e8578237fade298dfcf8c928609aa88abc48"
|
583
560
|
dependencies = [
|
584
561
|
"bindgen",
|
585
562
|
"lazy_static",
|
@@ -598,9 +575,9 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
|
|
598
575
|
|
599
576
|
[[package]]
|
600
577
|
name = "regex"
|
601
|
-
version = "1.10.
|
578
|
+
version = "1.10.6"
|
602
579
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
603
|
-
checksum = "
|
580
|
+
checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619"
|
604
581
|
dependencies = [
|
605
582
|
"aho-corasick",
|
606
583
|
"memchr",
|
@@ -610,9 +587,9 @@ dependencies = [
|
|
610
587
|
|
611
588
|
[[package]]
|
612
589
|
name = "regex-automata"
|
613
|
-
version = "0.4.
|
590
|
+
version = "0.4.7"
|
614
591
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
615
|
-
checksum = "
|
592
|
+
checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
|
616
593
|
dependencies = [
|
617
594
|
"aho-corasick",
|
618
595
|
"memchr",
|
@@ -621,9 +598,9 @@ dependencies = [
|
|
621
598
|
|
622
599
|
[[package]]
|
623
600
|
name = "regex-syntax"
|
624
|
-
version = "0.8.
|
601
|
+
version = "0.8.4"
|
625
602
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
626
|
-
checksum = "
|
603
|
+
checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
|
627
604
|
|
628
605
|
[[package]]
|
629
606
|
name = "rustc-hash"
|
@@ -633,15 +610,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
|
|
633
610
|
|
634
611
|
[[package]]
|
635
612
|
name = "ryu"
|
636
|
-
version = "1.0.
|
613
|
+
version = "1.0.18"
|
637
614
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
638
|
-
checksum = "
|
639
|
-
|
640
|
-
[[package]]
|
641
|
-
name = "scopeguard"
|
642
|
-
version = "1.1.0"
|
643
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
644
|
-
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
|
615
|
+
checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
|
645
616
|
|
646
617
|
[[package]]
|
647
618
|
name = "seq-macro"
|
@@ -651,18 +622,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
|
|
651
622
|
|
652
623
|
[[package]]
|
653
624
|
name = "serde"
|
654
|
-
version = "1.0.
|
625
|
+
version = "1.0.205"
|
655
626
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
656
|
-
checksum = "
|
627
|
+
checksum = "e33aedb1a7135da52b7c21791455563facbbcc43d0f0f66165b42c21b3dfb150"
|
657
628
|
dependencies = [
|
658
629
|
"serde_derive",
|
659
630
|
]
|
660
631
|
|
661
632
|
[[package]]
|
662
633
|
name = "serde_derive"
|
663
|
-
version = "1.0.
|
634
|
+
version = "1.0.205"
|
664
635
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
665
|
-
checksum = "
|
636
|
+
checksum = "692d6f5ac90220161d6774db30c662202721e64aed9058d2c394f451261420c1"
|
666
637
|
dependencies = [
|
667
638
|
"proc-macro2",
|
668
639
|
"quote",
|
@@ -671,11 +642,12 @@ dependencies = [
|
|
671
642
|
|
672
643
|
[[package]]
|
673
644
|
name = "serde_json"
|
674
|
-
version = "1.0.
|
645
|
+
version = "1.0.122"
|
675
646
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
676
|
-
checksum = "
|
647
|
+
checksum = "784b6203951c57ff748476b126ccb5e8e2959a5c19e5c617ab1956be3dbc68da"
|
677
648
|
dependencies = [
|
678
649
|
"itoa",
|
650
|
+
"memchr",
|
679
651
|
"ryu",
|
680
652
|
"serde",
|
681
653
|
]
|
@@ -688,15 +660,15 @@ checksum = "24188a676b6ae68c3b2cb3a01be17fbf7240ce009799bb56d5b1409051e78fde"
|
|
688
660
|
|
689
661
|
[[package]]
|
690
662
|
name = "shlex"
|
691
|
-
version = "1.
|
663
|
+
version = "1.3.0"
|
692
664
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
693
|
-
checksum = "
|
665
|
+
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
694
666
|
|
695
667
|
[[package]]
|
696
668
|
name = "smallvec"
|
697
|
-
version = "1.
|
669
|
+
version = "1.13.2"
|
698
670
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
699
|
-
checksum = "
|
671
|
+
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
|
700
672
|
|
701
673
|
[[package]]
|
702
674
|
name = "spm_precompiled"
|
@@ -712,15 +684,15 @@ dependencies = [
|
|
712
684
|
|
713
685
|
[[package]]
|
714
686
|
name = "strsim"
|
715
|
-
version = "0.
|
687
|
+
version = "0.11.1"
|
716
688
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
717
|
-
checksum = "
|
689
|
+
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
718
690
|
|
719
691
|
[[package]]
|
720
692
|
name = "syn"
|
721
|
-
version = "2.0.
|
693
|
+
version = "2.0.72"
|
722
694
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
723
|
-
checksum = "
|
695
|
+
checksum = "dc4b9b9bf2add8093d3f2c0204471e951b2285580335de42f9d2534f3ae7a8af"
|
724
696
|
dependencies = [
|
725
697
|
"proc-macro2",
|
726
698
|
"quote",
|
@@ -729,18 +701,18 @@ dependencies = [
|
|
729
701
|
|
730
702
|
[[package]]
|
731
703
|
name = "thiserror"
|
732
|
-
version = "1.0.
|
704
|
+
version = "1.0.63"
|
733
705
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
734
|
-
checksum = "
|
706
|
+
checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724"
|
735
707
|
dependencies = [
|
736
708
|
"thiserror-impl",
|
737
709
|
]
|
738
710
|
|
739
711
|
[[package]]
|
740
712
|
name = "thiserror-impl"
|
741
|
-
version = "1.0.
|
713
|
+
version = "1.0.63"
|
742
714
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
743
|
-
checksum = "
|
715
|
+
checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261"
|
744
716
|
dependencies = [
|
745
717
|
"proc-macro2",
|
746
718
|
"quote",
|
@@ -749,19 +721,19 @@ dependencies = [
|
|
749
721
|
|
750
722
|
[[package]]
|
751
723
|
name = "tokenizers"
|
752
|
-
version = "0.5.
|
724
|
+
version = "0.5.1"
|
753
725
|
dependencies = [
|
754
726
|
"magnus",
|
755
727
|
"onig",
|
756
728
|
"serde",
|
757
|
-
"tokenizers 0.
|
729
|
+
"tokenizers 0.20.0",
|
758
730
|
]
|
759
731
|
|
760
732
|
[[package]]
|
761
733
|
name = "tokenizers"
|
762
|
-
version = "0.
|
734
|
+
version = "0.20.0"
|
763
735
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
764
|
-
checksum = "
|
736
|
+
checksum = "c8a24d7f7d6be5b9d1377418b893ab1808af0074f5d1bb2c64784452ddd2aa70"
|
765
737
|
dependencies = [
|
766
738
|
"aho-corasick",
|
767
739
|
"derive_builder",
|
@@ -791,9 +763,9 @@ dependencies = [
|
|
791
763
|
|
792
764
|
[[package]]
|
793
765
|
name = "unicode-ident"
|
794
|
-
version = "1.0.
|
766
|
+
version = "1.0.12"
|
795
767
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
796
|
-
checksum = "
|
768
|
+
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
|
797
769
|
|
798
770
|
[[package]]
|
799
771
|
name = "unicode-normalization-alignments"
|
@@ -812,9 +784,9 @@ checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
|
|
812
784
|
|
813
785
|
[[package]]
|
814
786
|
name = "unicode-width"
|
815
|
-
version = "0.1.
|
787
|
+
version = "0.1.13"
|
816
788
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
817
|
-
checksum = "
|
789
|
+
checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d"
|
818
790
|
|
819
791
|
[[package]]
|
820
792
|
name = "unicode_categories"
|
@@ -829,36 +801,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
829
801
|
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
830
802
|
|
831
803
|
[[package]]
|
832
|
-
name = "
|
833
|
-
version = "0.
|
804
|
+
name = "windows-sys"
|
805
|
+
version = "0.52.0"
|
834
806
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
835
|
-
checksum = "
|
807
|
+
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
|
836
808
|
dependencies = [
|
837
|
-
"
|
838
|
-
"winapi-x86_64-pc-windows-gnu",
|
809
|
+
"windows-targets",
|
839
810
|
]
|
840
811
|
|
841
812
|
[[package]]
|
842
|
-
name = "
|
843
|
-
version = "0.
|
813
|
+
name = "windows-targets"
|
814
|
+
version = "0.52.6"
|
844
815
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
845
|
-
checksum = "
|
846
|
-
|
847
|
-
[[package]]
|
848
|
-
name = "winapi-x86_64-pc-windows-gnu"
|
849
|
-
version = "0.4.0"
|
850
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
851
|
-
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
852
|
-
|
853
|
-
[[package]]
|
854
|
-
name = "windows-sys"
|
855
|
-
version = "0.42.0"
|
856
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
857
|
-
checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
|
816
|
+
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
|
858
817
|
dependencies = [
|
859
818
|
"windows_aarch64_gnullvm",
|
860
819
|
"windows_aarch64_msvc",
|
861
820
|
"windows_i686_gnu",
|
821
|
+
"windows_i686_gnullvm",
|
862
822
|
"windows_i686_msvc",
|
863
823
|
"windows_x86_64_gnu",
|
864
824
|
"windows_x86_64_gnullvm",
|
@@ -867,42 +827,69 @@ dependencies = [
|
|
867
827
|
|
868
828
|
[[package]]
|
869
829
|
name = "windows_aarch64_gnullvm"
|
870
|
-
version = "0.
|
830
|
+
version = "0.52.6"
|
871
831
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
872
|
-
checksum = "
|
832
|
+
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
|
873
833
|
|
874
834
|
[[package]]
|
875
835
|
name = "windows_aarch64_msvc"
|
876
|
-
version = "0.
|
836
|
+
version = "0.52.6"
|
877
837
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
878
|
-
checksum = "
|
838
|
+
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
|
879
839
|
|
880
840
|
[[package]]
|
881
841
|
name = "windows_i686_gnu"
|
882
|
-
version = "0.
|
842
|
+
version = "0.52.6"
|
883
843
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
884
|
-
checksum = "
|
844
|
+
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
|
845
|
+
|
846
|
+
[[package]]
|
847
|
+
name = "windows_i686_gnullvm"
|
848
|
+
version = "0.52.6"
|
849
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
850
|
+
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
|
885
851
|
|
886
852
|
[[package]]
|
887
853
|
name = "windows_i686_msvc"
|
888
|
-
version = "0.
|
854
|
+
version = "0.52.6"
|
889
855
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
890
|
-
checksum = "
|
856
|
+
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
|
891
857
|
|
892
858
|
[[package]]
|
893
859
|
name = "windows_x86_64_gnu"
|
894
|
-
version = "0.
|
860
|
+
version = "0.52.6"
|
895
861
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
896
|
-
checksum = "
|
862
|
+
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
|
897
863
|
|
898
864
|
[[package]]
|
899
865
|
name = "windows_x86_64_gnullvm"
|
900
|
-
version = "0.
|
866
|
+
version = "0.52.6"
|
901
867
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
902
|
-
checksum = "
|
868
|
+
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
|
903
869
|
|
904
870
|
[[package]]
|
905
871
|
name = "windows_x86_64_msvc"
|
906
|
-
version = "0.
|
872
|
+
version = "0.52.6"
|
907
873
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
908
|
-
checksum = "
|
874
|
+
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
875
|
+
|
876
|
+
[[package]]
|
877
|
+
name = "zerocopy"
|
878
|
+
version = "0.7.35"
|
879
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
880
|
+
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
|
881
|
+
dependencies = [
|
882
|
+
"byteorder",
|
883
|
+
"zerocopy-derive",
|
884
|
+
]
|
885
|
+
|
886
|
+
[[package]]
|
887
|
+
name = "zerocopy-derive"
|
888
|
+
version = "0.7.35"
|
889
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
890
|
+
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
|
891
|
+
dependencies = [
|
892
|
+
"proc-macro2",
|
893
|
+
"quote",
|
894
|
+
"syn",
|
895
|
+
]
|
data/ext/tokenizers/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "tokenizers"
|
3
|
-
version = "0.5.
|
3
|
+
version = "0.5.1"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -11,11 +11,11 @@ publish = false
|
|
11
11
|
crate-type = ["cdylib"]
|
12
12
|
|
13
13
|
[dependencies]
|
14
|
-
magnus = "0.
|
14
|
+
magnus = "0.7"
|
15
15
|
onig = { version = "6", default-features = false }
|
16
16
|
serde = { version = "1", features = ["rc", "derive"] }
|
17
17
|
|
18
18
|
[dependencies.tokenizers]
|
19
|
-
version = "=0.
|
19
|
+
version = "=0.20.0" # also update in from_pretrained.rb
|
20
20
|
default-features = false
|
21
21
|
features = ["progressbar", "onig", "esaxx_fast"]
|
@@ -222,8 +222,8 @@ pub struct RbSequence {}
|
|
222
222
|
impl RbSequence {
|
223
223
|
fn new(normalizers: RArray) -> RbResult<RbNormalizer> {
|
224
224
|
let mut sequence = Vec::with_capacity(normalizers.len());
|
225
|
-
for n in normalizers.
|
226
|
-
let normalizer: &RbNormalizer = TryConvert::try_convert(n
|
225
|
+
for n in normalizers.into_iter() {
|
226
|
+
let normalizer: &RbNormalizer = TryConvert::try_convert(n)?;
|
227
227
|
match &normalizer.normalizer {
|
228
228
|
RbNormalizerTypeWrapper::Sequence(inner) => sequence.extend(inner.iter().cloned()),
|
229
229
|
RbNormalizerTypeWrapper::Single(inner) => sequence.push(inner.clone()),
|
@@ -258,8 +258,8 @@ pub struct RbSequence {}
|
|
258
258
|
impl RbSequence {
|
259
259
|
fn new(pre_tokenizers: RArray) -> RbResult<RbPreTokenizer> {
|
260
260
|
let mut sequence = Vec::with_capacity(pre_tokenizers.len());
|
261
|
-
for n in pre_tokenizers.
|
262
|
-
let pretokenizer: &RbPreTokenizer = TryConvert::try_convert(n
|
261
|
+
for n in pre_tokenizers.into_iter() {
|
262
|
+
let pretokenizer: &RbPreTokenizer = TryConvert::try_convert(n)?;
|
263
263
|
match &pretokenizer.pretok {
|
264
264
|
RbPreTokenizerTypeWrapper::Sequence(inner) => {
|
265
265
|
sequence.extend(inner.iter().cloned())
|
@@ -282,12 +282,12 @@ impl RbTokenizer {
|
|
282
282
|
add_special_tokens: bool,
|
283
283
|
) -> RbResult<RArray> {
|
284
284
|
let input: Vec<tk::EncodeInput> = input
|
285
|
-
.
|
285
|
+
.into_iter()
|
286
286
|
.map(|o| {
|
287
287
|
let input: tk::EncodeInput = if is_pretokenized {
|
288
|
-
PreTokenizedEncodeInput::try_convert(o
|
288
|
+
PreTokenizedEncodeInput::try_convert(o)?.into()
|
289
289
|
} else {
|
290
|
-
TextEncodeInput::try_convert(o
|
290
|
+
TextEncodeInput::try_convert(o)?.into()
|
291
291
|
};
|
292
292
|
Ok(input)
|
293
293
|
})
|
@@ -319,26 +319,26 @@ impl RbTokenizer {
|
|
319
319
|
.map_err(RbError::from)
|
320
320
|
}
|
321
321
|
|
322
|
-
pub fn set_decoder(&self, decoder:
|
323
|
-
self.tokenizer.borrow_mut().with_decoder(decoder.
|
322
|
+
pub fn set_decoder(&self, decoder: Option<&RbDecoder>) {
|
323
|
+
self.tokenizer.borrow_mut().with_decoder(decoder.cloned());
|
324
324
|
}
|
325
325
|
|
326
|
-
pub fn set_pre_tokenizer(&self, pretok:
|
326
|
+
pub fn set_pre_tokenizer(&self, pretok: Option<&RbPreTokenizer>) {
|
327
327
|
self.tokenizer
|
328
328
|
.borrow_mut()
|
329
|
-
.with_pre_tokenizer(pretok.
|
329
|
+
.with_pre_tokenizer(pretok.cloned());
|
330
330
|
}
|
331
331
|
|
332
|
-
pub fn set_post_processor(&self, processor:
|
332
|
+
pub fn set_post_processor(&self, processor: Option<&RbPostProcessor>) {
|
333
333
|
self.tokenizer
|
334
334
|
.borrow_mut()
|
335
|
-
.with_post_processor(processor.
|
335
|
+
.with_post_processor(processor.cloned());
|
336
336
|
}
|
337
337
|
|
338
|
-
pub fn set_normalizer(&self, normalizer:
|
338
|
+
pub fn set_normalizer(&self, normalizer: Option<&RbNormalizer>) {
|
339
339
|
self.tokenizer
|
340
340
|
.borrow_mut()
|
341
|
-
.with_normalizer(normalizer.
|
341
|
+
.with_normalizer(normalizer.cloned());
|
342
342
|
}
|
343
343
|
|
344
344
|
pub fn token_to_id(&self, token: String) -> Option<u32> {
|
@@ -110,9 +110,9 @@ impl RbTrainer {
|
|
110
110
|
BpeTrainer,
|
111
111
|
special_tokens,
|
112
112
|
special_tokens
|
113
|
-
.
|
113
|
+
.into_iter()
|
114
114
|
.map(|token| {
|
115
|
-
if let Ok(content) = String::try_convert(token
|
115
|
+
if let Ok(content) = String::try_convert(token) {
|
116
116
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
117
117
|
} else {
|
118
118
|
todo!()
|
@@ -197,9 +197,9 @@ impl RbTrainer {
|
|
197
197
|
UnigramTrainer,
|
198
198
|
special_tokens,
|
199
199
|
special_tokens
|
200
|
-
.
|
200
|
+
.into_iter()
|
201
201
|
.map(|token| {
|
202
|
-
if let Ok(content) = String::try_convert(token
|
202
|
+
if let Ok(content) = String::try_convert(token) {
|
203
203
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
204
204
|
} else {
|
205
205
|
todo!()
|
@@ -268,9 +268,9 @@ impl RbTrainer {
|
|
268
268
|
WordLevelTrainer,
|
269
269
|
special_tokens,
|
270
270
|
special_tokens
|
271
|
-
.
|
271
|
+
.into_iter()
|
272
272
|
.map(|token| {
|
273
|
-
if let Ok(content) = String::try_convert(token
|
273
|
+
if let Ok(content) = String::try_convert(token) {
|
274
274
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
275
275
|
} else {
|
276
276
|
todo!()
|
@@ -322,9 +322,9 @@ impl RbTrainer {
|
|
322
322
|
WordPieceTrainer,
|
323
323
|
@set_special_tokens,
|
324
324
|
special_tokens
|
325
|
-
.
|
325
|
+
.into_iter()
|
326
326
|
.map(|token| {
|
327
|
-
if let Ok(content) = String::try_convert(token
|
327
|
+
if let Ok(content) = String::try_convert(token) {
|
328
328
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
329
329
|
} else {
|
330
330
|
todo!()
|
@@ -398,9 +398,9 @@ impl RbBpeTrainer {
|
|
398
398
|
if !value.is_nil() {
|
399
399
|
builder = builder.special_tokens(
|
400
400
|
RArray::try_convert(value)?
|
401
|
-
.
|
401
|
+
.into_iter()
|
402
402
|
.map(|token| {
|
403
|
-
if let Ok(content) = String::try_convert(token
|
403
|
+
if let Ok(content) = String::try_convert(token) {
|
404
404
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
405
405
|
} else {
|
406
406
|
todo!()
|
@@ -466,9 +466,9 @@ impl RbUnigramTrainer {
|
|
466
466
|
if !value.is_nil() {
|
467
467
|
builder.special_tokens(
|
468
468
|
RArray::try_convert(value)?
|
469
|
-
.
|
469
|
+
.into_iter()
|
470
470
|
.map(|token| {
|
471
|
-
if let Ok(content) = String::try_convert(token
|
471
|
+
if let Ok(content) = String::try_convert(token) {
|
472
472
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
473
473
|
} else {
|
474
474
|
todo!()
|
@@ -540,9 +540,9 @@ impl RbWordLevelTrainer {
|
|
540
540
|
if !value.is_nil() {
|
541
541
|
builder.special_tokens(
|
542
542
|
RArray::try_convert(value)?
|
543
|
-
.
|
543
|
+
.into_iter()
|
544
544
|
.map(|token| {
|
545
|
-
if let Ok(content) = String::try_convert(token
|
545
|
+
if let Ok(content) = String::try_convert(token) {
|
546
546
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
547
547
|
} else {
|
548
548
|
todo!()
|
@@ -581,9 +581,9 @@ impl RbWordPieceTrainer {
|
|
581
581
|
if !value.is_nil() {
|
582
582
|
builder = builder.special_tokens(
|
583
583
|
RArray::try_convert(value)?
|
584
|
-
.
|
584
|
+
.into_iter()
|
585
585
|
.map(|token| {
|
586
|
-
if let Ok(content) = String::try_convert(token
|
586
|
+
if let Ok(content) = String::try_convert(token) {
|
587
587
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
588
588
|
} else {
|
589
589
|
todo!()
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module Tokenizers
|
2
2
|
module FromPretrained
|
3
3
|
# for user agent
|
4
|
-
TOKENIZERS_VERSION = "0.
|
4
|
+
TOKENIZERS_VERSION = "0.20.0"
|
5
5
|
|
6
6
|
# use Ruby for downloads
|
7
7
|
# this avoids the need to vendor OpenSSL on Linux
|
@@ -67,7 +67,7 @@ module Tokenizers
|
|
67
67
|
end
|
68
68
|
end
|
69
69
|
|
70
|
-
options[:content_length_proc] = ->
|
70
|
+
options[:content_length_proc] = ->(_) { puts "Downloading..." }
|
71
71
|
|
72
72
|
# string options are headers
|
73
73
|
tempfile = URI.parse(url).open(headers.merge(options))
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-08-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
100
100
|
- !ruby/object:Gem::Version
|
101
101
|
version: '0'
|
102
102
|
requirements: []
|
103
|
-
rubygems_version: 3.5.
|
103
|
+
rubygems_version: 3.5.11
|
104
104
|
signing_key:
|
105
105
|
specification_version: 4
|
106
106
|
summary: Fast state-of-the-art tokenizers for Ruby
|