tokenizers 0.5.0 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/Cargo.lock +167 -177
- data/ext/tokenizers/Cargo.toml +3 -3
- data/ext/tokenizers/src/decoders.rs +7 -0
- data/ext/tokenizers/src/lib.rs +7 -0
- data/ext/tokenizers/src/normalizers.rs +2 -2
- data/ext/tokenizers/src/pre_tokenizers.rs +2 -2
- data/ext/tokenizers/src/tokenizer.rs +45 -12
- data/ext/tokenizers/src/trainers.rs +16 -16
- data/lib/tokenizers/from_pretrained.rb +2 -2
- data/lib/tokenizers/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 556d084ad69603fa0d5ff61c4a03864d56fbe4d525d706390851d1a5761d173a
|
4
|
+
data.tar.gz: 4dd1ab9a2de88f60135aca333c7d48557fa1ad6c8e31c61065717e77a37f0cdd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3d6b7209189aeec8846a50f0e65a24e1089e7e0998d1f3d07026446c18b2b0c139d5b7566ca2fc721f72b67519c50595144d6deb3603d032fb41d20c7bc8c6e7
|
7
|
+
data.tar.gz: a7f04aa13c7cbc3c3973408140fc9f4c3330dacd150d4edd33f16e6c218e03a949c9d44ff03b7b4b6e63eedd2623b2abf417a647a8f2260aa67d64594f06c6fc
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
## 0.5.2 (2024-08-26)
|
2
|
+
|
3
|
+
- Added `from_str` method to `Tokenizer`
|
4
|
+
- Added `model` and `model=` methods to `Tokenizer`
|
5
|
+
- Added `decoder`, `pre_tokenizer`, `post_processor`, and `normalizer` methods to `Tokenizer`
|
6
|
+
- Added `decode` method to `Decoder`
|
7
|
+
|
8
|
+
## 0.5.1 (2024-08-13)
|
9
|
+
|
10
|
+
- Updated Tokenizers to 0.20.0
|
11
|
+
- Added precompiled gem for Linux ARM MUSL
|
12
|
+
|
1
13
|
## 0.5.0 (2024-05-21)
|
2
14
|
|
3
15
|
- Updated Tokenizers to 0.19.1
|
data/Cargo.lock
CHANGED
@@ -4,19 +4,13 @@ version = 3
|
|
4
4
|
|
5
5
|
[[package]]
|
6
6
|
name = "aho-corasick"
|
7
|
-
version = "1.1.
|
7
|
+
version = "1.1.3"
|
8
8
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
9
|
-
checksum = "
|
9
|
+
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
10
10
|
dependencies = [
|
11
11
|
"memchr",
|
12
12
|
]
|
13
13
|
|
14
|
-
[[package]]
|
15
|
-
name = "autocfg"
|
16
|
-
version = "1.1.0"
|
17
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
18
|
-
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
|
19
|
-
|
20
14
|
[[package]]
|
21
15
|
name = "base64"
|
22
16
|
version = "0.13.1"
|
@@ -25,16 +19,16 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
|
25
19
|
|
26
20
|
[[package]]
|
27
21
|
name = "bindgen"
|
28
|
-
version = "0.69.
|
22
|
+
version = "0.69.4"
|
29
23
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
30
|
-
checksum = "
|
24
|
+
checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0"
|
31
25
|
dependencies = [
|
32
|
-
"bitflags 2.
|
26
|
+
"bitflags 2.6.0",
|
33
27
|
"cexpr",
|
34
28
|
"clang-sys",
|
29
|
+
"itertools 0.12.1",
|
35
30
|
"lazy_static",
|
36
31
|
"lazycell",
|
37
|
-
"peeking_take_while",
|
38
32
|
"proc-macro2",
|
39
33
|
"quote",
|
40
34
|
"regex",
|
@@ -51,15 +45,24 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
|
51
45
|
|
52
46
|
[[package]]
|
53
47
|
name = "bitflags"
|
54
|
-
version = "2.
|
48
|
+
version = "2.6.0"
|
55
49
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
56
|
-
checksum = "
|
50
|
+
checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
|
51
|
+
|
52
|
+
[[package]]
|
53
|
+
name = "byteorder"
|
54
|
+
version = "1.5.0"
|
55
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
56
|
+
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
|
57
57
|
|
58
58
|
[[package]]
|
59
59
|
name = "cc"
|
60
|
-
version = "1.
|
60
|
+
version = "1.1.15"
|
61
61
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
62
|
-
checksum = "
|
62
|
+
checksum = "57b6a275aa2903740dc87da01c62040406b8812552e97129a63ea8850a17c6e6"
|
63
|
+
dependencies = [
|
64
|
+
"shlex",
|
65
|
+
]
|
63
66
|
|
64
67
|
[[package]]
|
65
68
|
name = "cexpr"
|
@@ -78,9 +81,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
|
78
81
|
|
79
82
|
[[package]]
|
80
83
|
name = "clang-sys"
|
81
|
-
version = "1.
|
84
|
+
version = "1.8.1"
|
82
85
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
83
|
-
checksum = "
|
86
|
+
checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
|
84
87
|
dependencies = [
|
85
88
|
"glob",
|
86
89
|
"libc",
|
@@ -89,9 +92,9 @@ dependencies = [
|
|
89
92
|
|
90
93
|
[[package]]
|
91
94
|
name = "console"
|
92
|
-
version = "0.15.
|
95
|
+
version = "0.15.8"
|
93
96
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
94
|
-
checksum = "
|
97
|
+
checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb"
|
95
98
|
dependencies = [
|
96
99
|
"encode_unicode",
|
97
100
|
"lazy_static",
|
@@ -102,42 +105,34 @@ dependencies = [
|
|
102
105
|
|
103
106
|
[[package]]
|
104
107
|
name = "crossbeam-deque"
|
105
|
-
version = "0.8.
|
108
|
+
version = "0.8.5"
|
106
109
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
107
|
-
checksum = "
|
110
|
+
checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
|
108
111
|
dependencies = [
|
109
|
-
"cfg-if",
|
110
112
|
"crossbeam-epoch",
|
111
113
|
"crossbeam-utils",
|
112
114
|
]
|
113
115
|
|
114
116
|
[[package]]
|
115
117
|
name = "crossbeam-epoch"
|
116
|
-
version = "0.9.
|
118
|
+
version = "0.9.18"
|
117
119
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
118
|
-
checksum = "
|
120
|
+
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
119
121
|
dependencies = [
|
120
|
-
"autocfg",
|
121
|
-
"cfg-if",
|
122
122
|
"crossbeam-utils",
|
123
|
-
"memoffset",
|
124
|
-
"scopeguard",
|
125
123
|
]
|
126
124
|
|
127
125
|
[[package]]
|
128
126
|
name = "crossbeam-utils"
|
129
|
-
version = "0.8.
|
127
|
+
version = "0.8.20"
|
130
128
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
131
|
-
checksum = "
|
132
|
-
dependencies = [
|
133
|
-
"cfg-if",
|
134
|
-
]
|
129
|
+
checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
|
135
130
|
|
136
131
|
[[package]]
|
137
132
|
name = "darling"
|
138
|
-
version = "0.20.
|
133
|
+
version = "0.20.10"
|
139
134
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
140
|
-
checksum = "
|
135
|
+
checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989"
|
141
136
|
dependencies = [
|
142
137
|
"darling_core",
|
143
138
|
"darling_macro",
|
@@ -145,9 +140,9 @@ dependencies = [
|
|
145
140
|
|
146
141
|
[[package]]
|
147
142
|
name = "darling_core"
|
148
|
-
version = "0.20.
|
143
|
+
version = "0.20.10"
|
149
144
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
150
|
-
checksum = "
|
145
|
+
checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5"
|
151
146
|
dependencies = [
|
152
147
|
"fnv",
|
153
148
|
"ident_case",
|
@@ -159,9 +154,9 @@ dependencies = [
|
|
159
154
|
|
160
155
|
[[package]]
|
161
156
|
name = "darling_macro"
|
162
|
-
version = "0.20.
|
157
|
+
version = "0.20.10"
|
163
158
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
164
|
-
checksum = "
|
159
|
+
checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806"
|
165
160
|
dependencies = [
|
166
161
|
"darling_core",
|
167
162
|
"quote",
|
@@ -201,9 +196,9 @@ dependencies = [
|
|
201
196
|
|
202
197
|
[[package]]
|
203
198
|
name = "either"
|
204
|
-
version = "1.
|
199
|
+
version = "1.13.0"
|
205
200
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
206
|
-
checksum = "
|
201
|
+
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
|
207
202
|
|
208
203
|
[[package]]
|
209
204
|
name = "encode_unicode"
|
@@ -228,9 +223,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
|
228
223
|
|
229
224
|
[[package]]
|
230
225
|
name = "getrandom"
|
231
|
-
version = "0.2.
|
226
|
+
version = "0.2.15"
|
232
227
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
233
|
-
checksum = "
|
228
|
+
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
|
234
229
|
dependencies = [
|
235
230
|
"cfg-if",
|
236
231
|
"libc",
|
@@ -251,9 +246,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
|
|
251
246
|
|
252
247
|
[[package]]
|
253
248
|
name = "indicatif"
|
254
|
-
version = "0.17.
|
249
|
+
version = "0.17.8"
|
255
250
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
256
|
-
checksum = "
|
251
|
+
checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3"
|
257
252
|
dependencies = [
|
258
253
|
"console",
|
259
254
|
"instant",
|
@@ -264,9 +259,9 @@ dependencies = [
|
|
264
259
|
|
265
260
|
[[package]]
|
266
261
|
name = "instant"
|
267
|
-
version = "0.1.
|
262
|
+
version = "0.1.13"
|
268
263
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
269
|
-
checksum = "
|
264
|
+
checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
|
270
265
|
dependencies = [
|
271
266
|
"cfg-if",
|
272
267
|
]
|
@@ -291,15 +286,15 @@ dependencies = [
|
|
291
286
|
|
292
287
|
[[package]]
|
293
288
|
name = "itoa"
|
294
|
-
version = "1.0.
|
289
|
+
version = "1.0.11"
|
295
290
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
296
|
-
checksum = "
|
291
|
+
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
|
297
292
|
|
298
293
|
[[package]]
|
299
294
|
name = "lazy_static"
|
300
|
-
version = "1.
|
295
|
+
version = "1.5.0"
|
301
296
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
302
|
-
checksum = "
|
297
|
+
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
303
298
|
|
304
299
|
[[package]]
|
305
300
|
name = "lazycell"
|
@@ -309,28 +304,25 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
|
309
304
|
|
310
305
|
[[package]]
|
311
306
|
name = "libc"
|
312
|
-
version = "0.2.
|
307
|
+
version = "0.2.158"
|
313
308
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
314
|
-
checksum = "
|
309
|
+
checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439"
|
315
310
|
|
316
311
|
[[package]]
|
317
312
|
name = "libloading"
|
318
|
-
version = "0.
|
313
|
+
version = "0.8.5"
|
319
314
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
320
|
-
checksum = "
|
315
|
+
checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
|
321
316
|
dependencies = [
|
322
317
|
"cfg-if",
|
323
|
-
"
|
318
|
+
"windows-targets",
|
324
319
|
]
|
325
320
|
|
326
321
|
[[package]]
|
327
322
|
name = "log"
|
328
|
-
version = "0.4.
|
323
|
+
version = "0.4.22"
|
329
324
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
330
|
-
checksum = "
|
331
|
-
dependencies = [
|
332
|
-
"cfg-if",
|
333
|
-
]
|
325
|
+
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
|
334
326
|
|
335
327
|
[[package]]
|
336
328
|
name = "macro_rules_attribute"
|
@@ -350,9 +342,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
|
|
350
342
|
|
351
343
|
[[package]]
|
352
344
|
name = "magnus"
|
353
|
-
version = "0.
|
345
|
+
version = "0.7.1"
|
354
346
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
355
|
-
checksum = "
|
347
|
+
checksum = "3d87ae53030f3a22e83879e666cb94e58a7bdf31706878a0ba48752994146dab"
|
356
348
|
dependencies = [
|
357
349
|
"magnus-macros",
|
358
350
|
"rb-sys",
|
@@ -373,18 +365,9 @@ dependencies = [
|
|
373
365
|
|
374
366
|
[[package]]
|
375
367
|
name = "memchr"
|
376
|
-
version = "2.
|
377
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
378
|
-
checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c"
|
379
|
-
|
380
|
-
[[package]]
|
381
|
-
name = "memoffset"
|
382
|
-
version = "0.8.0"
|
368
|
+
version = "2.7.4"
|
383
369
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
384
|
-
checksum = "
|
385
|
-
dependencies = [
|
386
|
-
"autocfg",
|
387
|
-
]
|
370
|
+
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
388
371
|
|
389
372
|
[[package]]
|
390
373
|
name = "minimal-lexical"
|
@@ -394,9 +377,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
|
394
377
|
|
395
378
|
[[package]]
|
396
379
|
name = "monostate"
|
397
|
-
version = "0.1.
|
380
|
+
version = "0.1.13"
|
398
381
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
399
|
-
checksum = "
|
382
|
+
checksum = "0d208407d7552cd041d8cdb69a1bc3303e029c598738177a3d87082004dc0e1e"
|
400
383
|
dependencies = [
|
401
384
|
"monostate-impl",
|
402
385
|
"serde",
|
@@ -404,9 +387,9 @@ dependencies = [
|
|
404
387
|
|
405
388
|
[[package]]
|
406
389
|
name = "monostate-impl"
|
407
|
-
version = "0.1.
|
390
|
+
version = "0.1.13"
|
408
391
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
409
|
-
checksum = "
|
392
|
+
checksum = "a7ce64b975ed4f123575d11afd9491f2e37bbd5813fbfbc0f09ae1fbddea74e0"
|
410
393
|
dependencies = [
|
411
394
|
"proc-macro2",
|
412
395
|
"quote",
|
@@ -431,9 +414,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
|
|
431
414
|
|
432
415
|
[[package]]
|
433
416
|
name = "once_cell"
|
434
|
-
version = "1.
|
417
|
+
version = "1.19.0"
|
435
418
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
436
|
-
checksum = "
|
419
|
+
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
|
437
420
|
|
438
421
|
[[package]]
|
439
422
|
name = "onig"
|
@@ -459,48 +442,45 @@ dependencies = [
|
|
459
442
|
|
460
443
|
[[package]]
|
461
444
|
name = "paste"
|
462
|
-
version = "1.0.
|
445
|
+
version = "1.0.15"
|
463
446
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
464
|
-
checksum = "
|
465
|
-
|
466
|
-
[[package]]
|
467
|
-
name = "peeking_take_while"
|
468
|
-
version = "0.1.2"
|
469
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
470
|
-
checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
|
447
|
+
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
|
471
448
|
|
472
449
|
[[package]]
|
473
450
|
name = "pkg-config"
|
474
|
-
version = "0.3.
|
451
|
+
version = "0.3.30"
|
475
452
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
476
|
-
checksum = "
|
453
|
+
checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec"
|
477
454
|
|
478
455
|
[[package]]
|
479
456
|
name = "portable-atomic"
|
480
|
-
version = "1.
|
457
|
+
version = "1.7.0"
|
481
458
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
482
|
-
checksum = "
|
459
|
+
checksum = "da544ee218f0d287a911e9c99a39a8c9bc8fcad3cb8db5959940044ecfc67265"
|
483
460
|
|
484
461
|
[[package]]
|
485
462
|
name = "ppv-lite86"
|
486
|
-
version = "0.2.
|
463
|
+
version = "0.2.20"
|
487
464
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
488
|
-
checksum = "
|
465
|
+
checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
|
466
|
+
dependencies = [
|
467
|
+
"zerocopy",
|
468
|
+
]
|
489
469
|
|
490
470
|
[[package]]
|
491
471
|
name = "proc-macro2"
|
492
|
-
version = "1.0.
|
472
|
+
version = "1.0.86"
|
493
473
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
494
|
-
checksum = "
|
474
|
+
checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
|
495
475
|
dependencies = [
|
496
476
|
"unicode-ident",
|
497
477
|
]
|
498
478
|
|
499
479
|
[[package]]
|
500
480
|
name = "quote"
|
501
|
-
version = "1.0.
|
481
|
+
version = "1.0.37"
|
502
482
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
503
|
-
checksum = "
|
483
|
+
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
|
504
484
|
dependencies = [
|
505
485
|
"proc-macro2",
|
506
486
|
]
|
@@ -568,18 +548,18 @@ dependencies = [
|
|
568
548
|
|
569
549
|
[[package]]
|
570
550
|
name = "rb-sys"
|
571
|
-
version = "0.9.
|
551
|
+
version = "0.9.102"
|
572
552
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
573
|
-
checksum = "
|
553
|
+
checksum = "df4dec4b1d304c3b308a2cd86b1216ea45dd4361f4e9fa056f108332d0a450c1"
|
574
554
|
dependencies = [
|
575
555
|
"rb-sys-build",
|
576
556
|
]
|
577
557
|
|
578
558
|
[[package]]
|
579
559
|
name = "rb-sys-build"
|
580
|
-
version = "0.9.
|
560
|
+
version = "0.9.102"
|
581
561
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
582
|
-
checksum = "
|
562
|
+
checksum = "1d71de3e29d174b8fb17b5d4470f27d7aa2605f8a9d05fda0d3aeff30e05a570"
|
583
563
|
dependencies = [
|
584
564
|
"bindgen",
|
585
565
|
"lazy_static",
|
@@ -598,9 +578,9 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
|
|
598
578
|
|
599
579
|
[[package]]
|
600
580
|
name = "regex"
|
601
|
-
version = "1.10.
|
581
|
+
version = "1.10.6"
|
602
582
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
603
|
-
checksum = "
|
583
|
+
checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619"
|
604
584
|
dependencies = [
|
605
585
|
"aho-corasick",
|
606
586
|
"memchr",
|
@@ -610,9 +590,9 @@ dependencies = [
|
|
610
590
|
|
611
591
|
[[package]]
|
612
592
|
name = "regex-automata"
|
613
|
-
version = "0.4.
|
593
|
+
version = "0.4.7"
|
614
594
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
615
|
-
checksum = "
|
595
|
+
checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
|
616
596
|
dependencies = [
|
617
597
|
"aho-corasick",
|
618
598
|
"memchr",
|
@@ -621,9 +601,9 @@ dependencies = [
|
|
621
601
|
|
622
602
|
[[package]]
|
623
603
|
name = "regex-syntax"
|
624
|
-
version = "0.8.
|
604
|
+
version = "0.8.4"
|
625
605
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
626
|
-
checksum = "
|
606
|
+
checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
|
627
607
|
|
628
608
|
[[package]]
|
629
609
|
name = "rustc-hash"
|
@@ -633,15 +613,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
|
|
633
613
|
|
634
614
|
[[package]]
|
635
615
|
name = "ryu"
|
636
|
-
version = "1.0.
|
637
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
638
|
-
checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
|
639
|
-
|
640
|
-
[[package]]
|
641
|
-
name = "scopeguard"
|
642
|
-
version = "1.1.0"
|
616
|
+
version = "1.0.18"
|
643
617
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
644
|
-
checksum = "
|
618
|
+
checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
|
645
619
|
|
646
620
|
[[package]]
|
647
621
|
name = "seq-macro"
|
@@ -651,18 +625,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
|
|
651
625
|
|
652
626
|
[[package]]
|
653
627
|
name = "serde"
|
654
|
-
version = "1.0.
|
628
|
+
version = "1.0.209"
|
655
629
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
656
|
-
checksum = "
|
630
|
+
checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09"
|
657
631
|
dependencies = [
|
658
632
|
"serde_derive",
|
659
633
|
]
|
660
634
|
|
661
635
|
[[package]]
|
662
636
|
name = "serde_derive"
|
663
|
-
version = "1.0.
|
637
|
+
version = "1.0.209"
|
664
638
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
665
|
-
checksum = "
|
639
|
+
checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170"
|
666
640
|
dependencies = [
|
667
641
|
"proc-macro2",
|
668
642
|
"quote",
|
@@ -671,11 +645,12 @@ dependencies = [
|
|
671
645
|
|
672
646
|
[[package]]
|
673
647
|
name = "serde_json"
|
674
|
-
version = "1.0.
|
648
|
+
version = "1.0.127"
|
675
649
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
676
|
-
checksum = "
|
650
|
+
checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad"
|
677
651
|
dependencies = [
|
678
652
|
"itoa",
|
653
|
+
"memchr",
|
679
654
|
"ryu",
|
680
655
|
"serde",
|
681
656
|
]
|
@@ -688,15 +663,15 @@ checksum = "24188a676b6ae68c3b2cb3a01be17fbf7240ce009799bb56d5b1409051e78fde"
|
|
688
663
|
|
689
664
|
[[package]]
|
690
665
|
name = "shlex"
|
691
|
-
version = "1.
|
666
|
+
version = "1.3.0"
|
692
667
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
693
|
-
checksum = "
|
668
|
+
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
694
669
|
|
695
670
|
[[package]]
|
696
671
|
name = "smallvec"
|
697
|
-
version = "1.
|
672
|
+
version = "1.13.2"
|
698
673
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
699
|
-
checksum = "
|
674
|
+
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
|
700
675
|
|
701
676
|
[[package]]
|
702
677
|
name = "spm_precompiled"
|
@@ -712,15 +687,15 @@ dependencies = [
|
|
712
687
|
|
713
688
|
[[package]]
|
714
689
|
name = "strsim"
|
715
|
-
version = "0.
|
690
|
+
version = "0.11.1"
|
716
691
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
717
|
-
checksum = "
|
692
|
+
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
718
693
|
|
719
694
|
[[package]]
|
720
695
|
name = "syn"
|
721
|
-
version = "2.0.
|
696
|
+
version = "2.0.76"
|
722
697
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
723
|
-
checksum = "
|
698
|
+
checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525"
|
724
699
|
dependencies = [
|
725
700
|
"proc-macro2",
|
726
701
|
"quote",
|
@@ -729,18 +704,18 @@ dependencies = [
|
|
729
704
|
|
730
705
|
[[package]]
|
731
706
|
name = "thiserror"
|
732
|
-
version = "1.0.
|
707
|
+
version = "1.0.63"
|
733
708
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
734
|
-
checksum = "
|
709
|
+
checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724"
|
735
710
|
dependencies = [
|
736
711
|
"thiserror-impl",
|
737
712
|
]
|
738
713
|
|
739
714
|
[[package]]
|
740
715
|
name = "thiserror-impl"
|
741
|
-
version = "1.0.
|
716
|
+
version = "1.0.63"
|
742
717
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
743
|
-
checksum = "
|
718
|
+
checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261"
|
744
719
|
dependencies = [
|
745
720
|
"proc-macro2",
|
746
721
|
"quote",
|
@@ -749,19 +724,19 @@ dependencies = [
|
|
749
724
|
|
750
725
|
[[package]]
|
751
726
|
name = "tokenizers"
|
752
|
-
version = "0.5.
|
727
|
+
version = "0.5.2"
|
753
728
|
dependencies = [
|
754
729
|
"magnus",
|
755
730
|
"onig",
|
756
731
|
"serde",
|
757
|
-
"tokenizers 0.
|
732
|
+
"tokenizers 0.20.0",
|
758
733
|
]
|
759
734
|
|
760
735
|
[[package]]
|
761
736
|
name = "tokenizers"
|
762
|
-
version = "0.
|
737
|
+
version = "0.20.0"
|
763
738
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
764
|
-
checksum = "
|
739
|
+
checksum = "c8a24d7f7d6be5b9d1377418b893ab1808af0074f5d1bb2c64784452ddd2aa70"
|
765
740
|
dependencies = [
|
766
741
|
"aho-corasick",
|
767
742
|
"derive_builder",
|
@@ -791,9 +766,9 @@ dependencies = [
|
|
791
766
|
|
792
767
|
[[package]]
|
793
768
|
name = "unicode-ident"
|
794
|
-
version = "1.0.
|
769
|
+
version = "1.0.12"
|
795
770
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
796
|
-
checksum = "
|
771
|
+
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
|
797
772
|
|
798
773
|
[[package]]
|
799
774
|
name = "unicode-normalization-alignments"
|
@@ -812,9 +787,9 @@ checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
|
|
812
787
|
|
813
788
|
[[package]]
|
814
789
|
name = "unicode-width"
|
815
|
-
version = "0.1.
|
790
|
+
version = "0.1.13"
|
816
791
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
817
|
-
checksum = "
|
792
|
+
checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d"
|
818
793
|
|
819
794
|
[[package]]
|
820
795
|
name = "unicode_categories"
|
@@ -829,36 +804,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
829
804
|
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
830
805
|
|
831
806
|
[[package]]
|
832
|
-
name = "
|
833
|
-
version = "0.
|
807
|
+
name = "windows-sys"
|
808
|
+
version = "0.52.0"
|
834
809
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
835
|
-
checksum = "
|
810
|
+
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
|
836
811
|
dependencies = [
|
837
|
-
"
|
838
|
-
"winapi-x86_64-pc-windows-gnu",
|
812
|
+
"windows-targets",
|
839
813
|
]
|
840
814
|
|
841
815
|
[[package]]
|
842
|
-
name = "
|
843
|
-
version = "0.
|
816
|
+
name = "windows-targets"
|
817
|
+
version = "0.52.6"
|
844
818
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
845
|
-
checksum = "
|
846
|
-
|
847
|
-
[[package]]
|
848
|
-
name = "winapi-x86_64-pc-windows-gnu"
|
849
|
-
version = "0.4.0"
|
850
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
851
|
-
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
852
|
-
|
853
|
-
[[package]]
|
854
|
-
name = "windows-sys"
|
855
|
-
version = "0.42.0"
|
856
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
857
|
-
checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
|
819
|
+
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
|
858
820
|
dependencies = [
|
859
821
|
"windows_aarch64_gnullvm",
|
860
822
|
"windows_aarch64_msvc",
|
861
823
|
"windows_i686_gnu",
|
824
|
+
"windows_i686_gnullvm",
|
862
825
|
"windows_i686_msvc",
|
863
826
|
"windows_x86_64_gnu",
|
864
827
|
"windows_x86_64_gnullvm",
|
@@ -867,42 +830,69 @@ dependencies = [
|
|
867
830
|
|
868
831
|
[[package]]
|
869
832
|
name = "windows_aarch64_gnullvm"
|
870
|
-
version = "0.
|
833
|
+
version = "0.52.6"
|
871
834
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
872
|
-
checksum = "
|
835
|
+
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
|
873
836
|
|
874
837
|
[[package]]
|
875
838
|
name = "windows_aarch64_msvc"
|
876
|
-
version = "0.
|
839
|
+
version = "0.52.6"
|
877
840
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
878
|
-
checksum = "
|
841
|
+
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
|
879
842
|
|
880
843
|
[[package]]
|
881
844
|
name = "windows_i686_gnu"
|
882
|
-
version = "0.
|
845
|
+
version = "0.52.6"
|
846
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
847
|
+
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
|
848
|
+
|
849
|
+
[[package]]
|
850
|
+
name = "windows_i686_gnullvm"
|
851
|
+
version = "0.52.6"
|
883
852
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
884
|
-
checksum = "
|
853
|
+
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
|
885
854
|
|
886
855
|
[[package]]
|
887
856
|
name = "windows_i686_msvc"
|
888
|
-
version = "0.
|
857
|
+
version = "0.52.6"
|
889
858
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
890
|
-
checksum = "
|
859
|
+
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
|
891
860
|
|
892
861
|
[[package]]
|
893
862
|
name = "windows_x86_64_gnu"
|
894
|
-
version = "0.
|
863
|
+
version = "0.52.6"
|
895
864
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
896
|
-
checksum = "
|
865
|
+
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
|
897
866
|
|
898
867
|
[[package]]
|
899
868
|
name = "windows_x86_64_gnullvm"
|
900
|
-
version = "0.
|
869
|
+
version = "0.52.6"
|
901
870
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
902
|
-
checksum = "
|
871
|
+
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
|
903
872
|
|
904
873
|
[[package]]
|
905
874
|
name = "windows_x86_64_msvc"
|
906
|
-
version = "0.
|
875
|
+
version = "0.52.6"
|
876
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
877
|
+
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
878
|
+
|
879
|
+
[[package]]
|
880
|
+
name = "zerocopy"
|
881
|
+
version = "0.7.35"
|
882
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
883
|
+
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
|
884
|
+
dependencies = [
|
885
|
+
"byteorder",
|
886
|
+
"zerocopy-derive",
|
887
|
+
]
|
888
|
+
|
889
|
+
[[package]]
|
890
|
+
name = "zerocopy-derive"
|
891
|
+
version = "0.7.35"
|
907
892
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
908
|
-
checksum = "
|
893
|
+
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
|
894
|
+
dependencies = [
|
895
|
+
"proc-macro2",
|
896
|
+
"quote",
|
897
|
+
"syn",
|
898
|
+
]
|
data/ext/tokenizers/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "tokenizers"
|
3
|
-
version = "0.5.
|
3
|
+
version = "0.5.2"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -11,11 +11,11 @@ publish = false
|
|
11
11
|
crate-type = ["cdylib"]
|
12
12
|
|
13
13
|
[dependencies]
|
14
|
-
magnus = "0.
|
14
|
+
magnus = "0.7"
|
15
15
|
onig = { version = "6", default-features = false }
|
16
16
|
serde = { version = "1", features = ["rc", "derive"] }
|
17
17
|
|
18
18
|
[dependencies.tokenizers]
|
19
|
-
version = "=0.
|
19
|
+
version = "=0.20.0" # also update in from_pretrained.rb
|
20
20
|
default-features = false
|
21
21
|
features = ["progressbar", "onig", "esaxx_fast"]
|
data/ext/tokenizers/src/decoders.rs
CHANGED
@@ -34,6 +34,12 @@ impl Decoder for RbDecoder {
|
|
34
34
|
}
|
35
35
|
}
|
36
36
|
|
37
|
+
impl RbDecoder {
|
38
|
+
pub fn decode(&self, tokens: Vec<String>) -> RbResult<String> {
|
39
|
+
self.decoder.decode(tokens).map_err(RbError::from)
|
40
|
+
}
|
41
|
+
}
|
42
|
+
|
37
43
|
macro_rules! getter {
|
38
44
|
($self: ident, $variant: ident, $($name: tt)+) => {{
|
39
45
|
let decoder = &$self.decoder;
|
@@ -358,6 +364,7 @@ unsafe impl TypedData for RbDecoder {
|
|
358
364
|
|
359
365
|
pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
360
366
|
let decoder = module.define_class("Decoder", ruby.class_object())?;
|
367
|
+
decoder.define_method("decode", method!(RbDecoder::decode, 1))?;
|
361
368
|
|
362
369
|
let class = module.define_class("BPEDecoder", decoder)?;
|
363
370
|
class.define_singleton_method("_new", function!(RbBPEDecoder::new, 1))?;
|
data/ext/tokenizers/src/lib.rs
CHANGED
@@ -42,6 +42,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
|
|
42
42
|
|
43
43
|
let class = module.define_class("Tokenizer", ruby.class_object())?;
|
44
44
|
class.define_singleton_method("new", function!(RbTokenizer::from_model, 1))?;
|
45
|
+
class.define_singleton_method("from_str", function!(RbTokenizer::from_str, 1))?;
|
45
46
|
class.define_singleton_method("from_file", function!(RbTokenizer::from_file, 1))?;
|
46
47
|
class.define_method(
|
47
48
|
"add_special_tokens",
|
@@ -54,12 +55,18 @@ fn init(ruby: &Ruby) -> RbResult<()> {
|
|
54
55
|
class.define_method("_encode_batch", method!(RbTokenizer::encode_batch, 3))?;
|
55
56
|
class.define_method("_decode", method!(RbTokenizer::decode, 2))?;
|
56
57
|
class.define_method("_decode_batch", method!(RbTokenizer::decode_batch, 2))?;
|
58
|
+
class.define_method("model", method!(RbTokenizer::get_model, 0))?;
|
59
|
+
class.define_method("model=", method!(RbTokenizer::set_model,1))?;
|
60
|
+
class.define_method("decoder", method!(RbTokenizer::get_decoder, 0))?;
|
57
61
|
class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
|
62
|
+
class.define_method("pre_tokenizer", method!(RbTokenizer::get_pre_tokenizer, 0))?;
|
58
63
|
class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
|
64
|
+
class.define_method("post_processor", method!(RbTokenizer::get_post_processor, 0))?;
|
59
65
|
class.define_method(
|
60
66
|
"post_processor=",
|
61
67
|
method!(RbTokenizer::set_post_processor, 1),
|
62
68
|
)?;
|
69
|
+
class.define_method("normalizer", method!(RbTokenizer::get_normalizer, 0))?;
|
63
70
|
class.define_method("normalizer=", method!(RbTokenizer::set_normalizer, 1))?;
|
64
71
|
class.define_method("token_to_id", method!(RbTokenizer::token_to_id, 1))?;
|
65
72
|
class.define_method("id_to_token", method!(RbTokenizer::id_to_token, 1))?;
|
@@ -222,8 +222,8 @@ pub struct RbSequence {}
|
|
222
222
|
impl RbSequence {
|
223
223
|
fn new(normalizers: RArray) -> RbResult<RbNormalizer> {
|
224
224
|
let mut sequence = Vec::with_capacity(normalizers.len());
|
225
|
-
for n in normalizers.
|
226
|
-
let normalizer: &RbNormalizer = TryConvert::try_convert(n
|
225
|
+
for n in normalizers.into_iter() {
|
226
|
+
let normalizer: &RbNormalizer = TryConvert::try_convert(n)?;
|
227
227
|
match &normalizer.normalizer {
|
228
228
|
RbNormalizerTypeWrapper::Sequence(inner) => sequence.extend(inner.iter().cloned()),
|
229
229
|
RbNormalizerTypeWrapper::Single(inner) => sequence.push(inner.clone()),
|
@@ -258,8 +258,8 @@ pub struct RbSequence {}
|
|
258
258
|
impl RbSequence {
|
259
259
|
fn new(pre_tokenizers: RArray) -> RbResult<RbPreTokenizer> {
|
260
260
|
let mut sequence = Vec::with_capacity(pre_tokenizers.len());
|
261
|
-
for n in pre_tokenizers.
|
262
|
-
let pretokenizer: &RbPreTokenizer = TryConvert::try_convert(n
|
261
|
+
for n in pre_tokenizers.into_iter() {
|
262
|
+
let pretokenizer: &RbPreTokenizer = TryConvert::try_convert(n)?;
|
263
263
|
match &pretokenizer.pretok {
|
264
264
|
RbPreTokenizerTypeWrapper::Sequence(inner) => {
|
265
265
|
sequence.extend(inner.iter().cloned())
|
@@ -1,9 +1,10 @@
|
|
1
1
|
use std::cell::RefCell;
|
2
2
|
use std::collections::HashMap;
|
3
3
|
use std::path::PathBuf;
|
4
|
+
use std::str::FromStr;
|
4
5
|
|
5
6
|
use magnus::prelude::*;
|
6
|
-
use magnus::{exception, Error, RArray, RHash, Symbol, TryConvert, Value};
|
7
|
+
use magnus::{exception, Error, RArray, RHash, RString, Symbol, TryConvert, Value};
|
7
8
|
use tk::tokenizer::{
|
8
9
|
Model, PaddingDirection, PaddingParams, PaddingStrategy,
|
9
10
|
TruncationDirection, TruncationParams, TruncationStrategy, TokenizerImpl
|
@@ -203,6 +204,14 @@ impl RbTokenizer {
|
|
203
204
|
RbTokenizer::new(TokenizerImpl::new(model.clone()))
|
204
205
|
}
|
205
206
|
|
207
|
+
pub fn from_str(json: RString) -> RbResult<Self> {
|
208
|
+
Tokenizer::from_str(unsafe { json.as_str()? })
|
209
|
+
.map(|v| RbTokenizer {
|
210
|
+
tokenizer: RefCell::new(v),
|
211
|
+
})
|
212
|
+
.map_err(RbError::from)
|
213
|
+
}
|
214
|
+
|
206
215
|
pub fn from_file(path: PathBuf) -> RbResult<Self> {
|
207
216
|
Tokenizer::from_file(path)
|
208
217
|
.map(|v| RbTokenizer {
|
@@ -282,12 +291,12 @@ impl RbTokenizer {
|
|
282
291
|
add_special_tokens: bool,
|
283
292
|
) -> RbResult<RArray> {
|
284
293
|
let input: Vec<tk::EncodeInput> = input
|
285
|
-
.
|
294
|
+
.into_iter()
|
286
295
|
.map(|o| {
|
287
296
|
let input: tk::EncodeInput = if is_pretokenized {
|
288
|
-
PreTokenizedEncodeInput::try_convert(o
|
297
|
+
PreTokenizedEncodeInput::try_convert(o)?.into()
|
289
298
|
} else {
|
290
|
-
TextEncodeInput::try_convert(o
|
299
|
+
TextEncodeInput::try_convert(o)?.into()
|
291
300
|
};
|
292
301
|
Ok(input)
|
293
302
|
})
|
@@ -319,26 +328,50 @@ impl RbTokenizer {
|
|
319
328
|
.map_err(RbError::from)
|
320
329
|
}
|
321
330
|
|
322
|
-
pub fn
|
323
|
-
self.tokenizer.
|
331
|
+
pub fn get_model(&self) -> RbModel {
|
332
|
+
self.tokenizer.borrow().get_model().clone()
|
333
|
+
}
|
334
|
+
|
335
|
+
pub fn set_model(&self, model: &RbModel) {
|
336
|
+
self.tokenizer.borrow_mut().with_model(model.clone());
|
337
|
+
}
|
338
|
+
|
339
|
+
pub fn get_decoder(&self) -> Option<RbDecoder> {
|
340
|
+
self.tokenizer.borrow().get_decoder().cloned()
|
341
|
+
}
|
342
|
+
|
343
|
+
pub fn set_decoder(&self, decoder: Option<&RbDecoder>) {
|
344
|
+
self.tokenizer.borrow_mut().with_decoder(decoder.cloned());
|
345
|
+
}
|
346
|
+
|
347
|
+
pub fn get_pre_tokenizer(&self) -> Option<RbPreTokenizer> {
|
348
|
+
self.tokenizer.borrow().get_pre_tokenizer().cloned()
|
324
349
|
}
|
325
350
|
|
326
|
-
pub fn set_pre_tokenizer(&self, pretok:
|
351
|
+
pub fn set_pre_tokenizer(&self, pretok: Option<&RbPreTokenizer>) {
|
327
352
|
self.tokenizer
|
328
353
|
.borrow_mut()
|
329
|
-
.with_pre_tokenizer(pretok.
|
354
|
+
.with_pre_tokenizer(pretok.cloned());
|
330
355
|
}
|
331
356
|
|
332
|
-
pub fn
|
357
|
+
pub fn get_post_processor(&self) -> Option<RbPostProcessor> {
|
358
|
+
self.tokenizer.borrow().get_post_processor().cloned()
|
359
|
+
}
|
360
|
+
|
361
|
+
pub fn set_post_processor(&self, processor: Option<&RbPostProcessor>) {
|
333
362
|
self.tokenizer
|
334
363
|
.borrow_mut()
|
335
|
-
.with_post_processor(processor.
|
364
|
+
.with_post_processor(processor.cloned());
|
365
|
+
}
|
366
|
+
|
367
|
+
pub fn get_normalizer(&self) -> Option<RbNormalizer> {
|
368
|
+
self.tokenizer.borrow().get_normalizer().cloned()
|
336
369
|
}
|
337
370
|
|
338
|
-
pub fn set_normalizer(&self, normalizer:
|
371
|
+
pub fn set_normalizer(&self, normalizer: Option<&RbNormalizer>) {
|
339
372
|
self.tokenizer
|
340
373
|
.borrow_mut()
|
341
|
-
.with_normalizer(normalizer.
|
374
|
+
.with_normalizer(normalizer.cloned());
|
342
375
|
}
|
343
376
|
|
344
377
|
pub fn token_to_id(&self, token: String) -> Option<u32> {
|
@@ -110,9 +110,9 @@ impl RbTrainer {
|
|
110
110
|
BpeTrainer,
|
111
111
|
special_tokens,
|
112
112
|
special_tokens
|
113
|
-
.
|
113
|
+
.into_iter()
|
114
114
|
.map(|token| {
|
115
|
-
if let Ok(content) = String::try_convert(token
|
115
|
+
if let Ok(content) = String::try_convert(token) {
|
116
116
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
117
117
|
} else {
|
118
118
|
todo!()
|
@@ -197,9 +197,9 @@ impl RbTrainer {
|
|
197
197
|
UnigramTrainer,
|
198
198
|
special_tokens,
|
199
199
|
special_tokens
|
200
|
-
.
|
200
|
+
.into_iter()
|
201
201
|
.map(|token| {
|
202
|
-
if let Ok(content) = String::try_convert(token
|
202
|
+
if let Ok(content) = String::try_convert(token) {
|
203
203
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
204
204
|
} else {
|
205
205
|
todo!()
|
@@ -268,9 +268,9 @@ impl RbTrainer {
|
|
268
268
|
WordLevelTrainer,
|
269
269
|
special_tokens,
|
270
270
|
special_tokens
|
271
|
-
.
|
271
|
+
.into_iter()
|
272
272
|
.map(|token| {
|
273
|
-
if let Ok(content) = String::try_convert(token
|
273
|
+
if let Ok(content) = String::try_convert(token) {
|
274
274
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
275
275
|
} else {
|
276
276
|
todo!()
|
@@ -322,9 +322,9 @@ impl RbTrainer {
|
|
322
322
|
WordPieceTrainer,
|
323
323
|
@set_special_tokens,
|
324
324
|
special_tokens
|
325
|
-
.
|
325
|
+
.into_iter()
|
326
326
|
.map(|token| {
|
327
|
-
if let Ok(content) = String::try_convert(token
|
327
|
+
if let Ok(content) = String::try_convert(token) {
|
328
328
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
329
329
|
} else {
|
330
330
|
todo!()
|
@@ -398,9 +398,9 @@ impl RbBpeTrainer {
|
|
398
398
|
if !value.is_nil() {
|
399
399
|
builder = builder.special_tokens(
|
400
400
|
RArray::try_convert(value)?
|
401
|
-
.
|
401
|
+
.into_iter()
|
402
402
|
.map(|token| {
|
403
|
-
if let Ok(content) = String::try_convert(token
|
403
|
+
if let Ok(content) = String::try_convert(token) {
|
404
404
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
405
405
|
} else {
|
406
406
|
todo!()
|
@@ -466,9 +466,9 @@ impl RbUnigramTrainer {
|
|
466
466
|
if !value.is_nil() {
|
467
467
|
builder.special_tokens(
|
468
468
|
RArray::try_convert(value)?
|
469
|
-
.
|
469
|
+
.into_iter()
|
470
470
|
.map(|token| {
|
471
|
-
if let Ok(content) = String::try_convert(token
|
471
|
+
if let Ok(content) = String::try_convert(token) {
|
472
472
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
473
473
|
} else {
|
474
474
|
todo!()
|
@@ -540,9 +540,9 @@ impl RbWordLevelTrainer {
|
|
540
540
|
if !value.is_nil() {
|
541
541
|
builder.special_tokens(
|
542
542
|
RArray::try_convert(value)?
|
543
|
-
.
|
543
|
+
.into_iter()
|
544
544
|
.map(|token| {
|
545
|
-
if let Ok(content) = String::try_convert(token
|
545
|
+
if let Ok(content) = String::try_convert(token) {
|
546
546
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
547
547
|
} else {
|
548
548
|
todo!()
|
@@ -581,9 +581,9 @@ impl RbWordPieceTrainer {
|
|
581
581
|
if !value.is_nil() {
|
582
582
|
builder = builder.special_tokens(
|
583
583
|
RArray::try_convert(value)?
|
584
|
-
.
|
584
|
+
.into_iter()
|
585
585
|
.map(|token| {
|
586
|
-
if let Ok(content) = String::try_convert(token
|
586
|
+
if let Ok(content) = String::try_convert(token) {
|
587
587
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
588
588
|
} else {
|
589
589
|
todo!()
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module Tokenizers
|
2
2
|
module FromPretrained
|
3
3
|
# for user agent
|
4
|
-
TOKENIZERS_VERSION = "0.19.1"
|
4
|
+
TOKENIZERS_VERSION = "0.20.0"
|
5
5
|
|
6
6
|
# use Ruby for downloads
|
7
7
|
# this avoids the need to vendor OpenSSL on Linux
|
@@ -67,7 +67,7 @@ module Tokenizers
|
|
67
67
|
end
|
68
68
|
end
|
69
69
|
|
70
|
-
options[:content_length_proc] = ->
|
70
|
+
options[:content_length_proc] = ->(_) { puts "Downloading..." }
|
71
71
|
|
72
72
|
# string options are headers
|
73
73
|
tempfile = URI.parse(url).open(headers.merge(options))
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.0
|
4
|
+
version: 0.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-05-21 00:00:00.000000000 Z
|
11
|
+
date: 2024-08-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
100
100
|
- !ruby/object:Gem::Version
|
101
101
|
version: '0'
|
102
102
|
requirements: []
|
103
|
-
rubygems_version: 3.5.
|
103
|
+
rubygems_version: 3.5.11
|
104
104
|
signing_key:
|
105
105
|
specification_version: 4
|
106
106
|
summary: Fast state-of-the-art tokenizers for Ruby
|