tokenizers 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/Cargo.lock +167 -177
- data/ext/tokenizers/Cargo.toml +3 -3
- data/ext/tokenizers/src/decoders.rs +7 -0
- data/ext/tokenizers/src/lib.rs +7 -0
- data/ext/tokenizers/src/normalizers.rs +2 -2
- data/ext/tokenizers/src/pre_tokenizers.rs +2 -2
- data/ext/tokenizers/src/tokenizer.rs +45 -12
- data/ext/tokenizers/src/trainers.rs +16 -16
- data/lib/tokenizers/from_pretrained.rb +2 -2
- data/lib/tokenizers/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 556d084ad69603fa0d5ff61c4a03864d56fbe4d525d706390851d1a5761d173a
|
|
4
|
+
data.tar.gz: 4dd1ab9a2de88f60135aca333c7d48557fa1ad6c8e31c61065717e77a37f0cdd
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 3d6b7209189aeec8846a50f0e65a24e1089e7e0998d1f3d07026446c18b2b0c139d5b7566ca2fc721f72b67519c50595144d6deb3603d032fb41d20c7bc8c6e7
|
|
7
|
+
data.tar.gz: a7f04aa13c7cbc3c3973408140fc9f4c3330dacd150d4edd33f16e6c218e03a949c9d44ff03b7b4b6e63eedd2623b2abf417a647a8f2260aa67d64594f06c6fc
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,15 @@
|
|
|
1
|
+
## 0.5.2 (2024-08-26)
|
|
2
|
+
|
|
3
|
+
- Added `from_str` method to `Tokenizer`
|
|
4
|
+
- Added `model` and `model=` methods to `Tokenizer`
|
|
5
|
+
- Added `decoder`, `pre_tokenizer`, `post_processor`, and `normalizer` methods to `Tokenizer`
|
|
6
|
+
- Added `decode` method to `Decoder`
|
|
7
|
+
|
|
8
|
+
## 0.5.1 (2024-08-13)
|
|
9
|
+
|
|
10
|
+
- Updated Tokenizers to 0.20.0
|
|
11
|
+
- Added precompiled gem for Linux ARM MUSL
|
|
12
|
+
|
|
1
13
|
## 0.5.0 (2024-05-21)
|
|
2
14
|
|
|
3
15
|
- Updated Tokenizers to 0.19.1
|
data/Cargo.lock
CHANGED
|
@@ -4,19 +4,13 @@ version = 3
|
|
|
4
4
|
|
|
5
5
|
[[package]]
|
|
6
6
|
name = "aho-corasick"
|
|
7
|
-
version = "1.1.
|
|
7
|
+
version = "1.1.3"
|
|
8
8
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
9
|
-
checksum = "
|
|
9
|
+
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
|
10
10
|
dependencies = [
|
|
11
11
|
"memchr",
|
|
12
12
|
]
|
|
13
13
|
|
|
14
|
-
[[package]]
|
|
15
|
-
name = "autocfg"
|
|
16
|
-
version = "1.1.0"
|
|
17
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
18
|
-
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
|
|
19
|
-
|
|
20
14
|
[[package]]
|
|
21
15
|
name = "base64"
|
|
22
16
|
version = "0.13.1"
|
|
@@ -25,16 +19,16 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
|
|
25
19
|
|
|
26
20
|
[[package]]
|
|
27
21
|
name = "bindgen"
|
|
28
|
-
version = "0.69.
|
|
22
|
+
version = "0.69.4"
|
|
29
23
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
30
|
-
checksum = "
|
|
24
|
+
checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0"
|
|
31
25
|
dependencies = [
|
|
32
|
-
"bitflags 2.
|
|
26
|
+
"bitflags 2.6.0",
|
|
33
27
|
"cexpr",
|
|
34
28
|
"clang-sys",
|
|
29
|
+
"itertools 0.12.1",
|
|
35
30
|
"lazy_static",
|
|
36
31
|
"lazycell",
|
|
37
|
-
"peeking_take_while",
|
|
38
32
|
"proc-macro2",
|
|
39
33
|
"quote",
|
|
40
34
|
"regex",
|
|
@@ -51,15 +45,24 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
|
|
51
45
|
|
|
52
46
|
[[package]]
|
|
53
47
|
name = "bitflags"
|
|
54
|
-
version = "2.
|
|
48
|
+
version = "2.6.0"
|
|
55
49
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
56
|
-
checksum = "
|
|
50
|
+
checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
|
|
51
|
+
|
|
52
|
+
[[package]]
|
|
53
|
+
name = "byteorder"
|
|
54
|
+
version = "1.5.0"
|
|
55
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
56
|
+
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
|
|
57
57
|
|
|
58
58
|
[[package]]
|
|
59
59
|
name = "cc"
|
|
60
|
-
version = "1.
|
|
60
|
+
version = "1.1.15"
|
|
61
61
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
62
|
-
checksum = "
|
|
62
|
+
checksum = "57b6a275aa2903740dc87da01c62040406b8812552e97129a63ea8850a17c6e6"
|
|
63
|
+
dependencies = [
|
|
64
|
+
"shlex",
|
|
65
|
+
]
|
|
63
66
|
|
|
64
67
|
[[package]]
|
|
65
68
|
name = "cexpr"
|
|
@@ -78,9 +81,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
|
|
78
81
|
|
|
79
82
|
[[package]]
|
|
80
83
|
name = "clang-sys"
|
|
81
|
-
version = "1.
|
|
84
|
+
version = "1.8.1"
|
|
82
85
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
83
|
-
checksum = "
|
|
86
|
+
checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
|
|
84
87
|
dependencies = [
|
|
85
88
|
"glob",
|
|
86
89
|
"libc",
|
|
@@ -89,9 +92,9 @@ dependencies = [
|
|
|
89
92
|
|
|
90
93
|
[[package]]
|
|
91
94
|
name = "console"
|
|
92
|
-
version = "0.15.
|
|
95
|
+
version = "0.15.8"
|
|
93
96
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
94
|
-
checksum = "
|
|
97
|
+
checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb"
|
|
95
98
|
dependencies = [
|
|
96
99
|
"encode_unicode",
|
|
97
100
|
"lazy_static",
|
|
@@ -102,42 +105,34 @@ dependencies = [
|
|
|
102
105
|
|
|
103
106
|
[[package]]
|
|
104
107
|
name = "crossbeam-deque"
|
|
105
|
-
version = "0.8.
|
|
108
|
+
version = "0.8.5"
|
|
106
109
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
107
|
-
checksum = "
|
|
110
|
+
checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
|
|
108
111
|
dependencies = [
|
|
109
|
-
"cfg-if",
|
|
110
112
|
"crossbeam-epoch",
|
|
111
113
|
"crossbeam-utils",
|
|
112
114
|
]
|
|
113
115
|
|
|
114
116
|
[[package]]
|
|
115
117
|
name = "crossbeam-epoch"
|
|
116
|
-
version = "0.9.
|
|
118
|
+
version = "0.9.18"
|
|
117
119
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
118
|
-
checksum = "
|
|
120
|
+
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
|
119
121
|
dependencies = [
|
|
120
|
-
"autocfg",
|
|
121
|
-
"cfg-if",
|
|
122
122
|
"crossbeam-utils",
|
|
123
|
-
"memoffset",
|
|
124
|
-
"scopeguard",
|
|
125
123
|
]
|
|
126
124
|
|
|
127
125
|
[[package]]
|
|
128
126
|
name = "crossbeam-utils"
|
|
129
|
-
version = "0.8.
|
|
127
|
+
version = "0.8.20"
|
|
130
128
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
131
|
-
checksum = "
|
|
132
|
-
dependencies = [
|
|
133
|
-
"cfg-if",
|
|
134
|
-
]
|
|
129
|
+
checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
|
|
135
130
|
|
|
136
131
|
[[package]]
|
|
137
132
|
name = "darling"
|
|
138
|
-
version = "0.20.
|
|
133
|
+
version = "0.20.10"
|
|
139
134
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
140
|
-
checksum = "
|
|
135
|
+
checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989"
|
|
141
136
|
dependencies = [
|
|
142
137
|
"darling_core",
|
|
143
138
|
"darling_macro",
|
|
@@ -145,9 +140,9 @@ dependencies = [
|
|
|
145
140
|
|
|
146
141
|
[[package]]
|
|
147
142
|
name = "darling_core"
|
|
148
|
-
version = "0.20.
|
|
143
|
+
version = "0.20.10"
|
|
149
144
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
150
|
-
checksum = "
|
|
145
|
+
checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5"
|
|
151
146
|
dependencies = [
|
|
152
147
|
"fnv",
|
|
153
148
|
"ident_case",
|
|
@@ -159,9 +154,9 @@ dependencies = [
|
|
|
159
154
|
|
|
160
155
|
[[package]]
|
|
161
156
|
name = "darling_macro"
|
|
162
|
-
version = "0.20.
|
|
157
|
+
version = "0.20.10"
|
|
163
158
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
164
|
-
checksum = "
|
|
159
|
+
checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806"
|
|
165
160
|
dependencies = [
|
|
166
161
|
"darling_core",
|
|
167
162
|
"quote",
|
|
@@ -201,9 +196,9 @@ dependencies = [
|
|
|
201
196
|
|
|
202
197
|
[[package]]
|
|
203
198
|
name = "either"
|
|
204
|
-
version = "1.
|
|
199
|
+
version = "1.13.0"
|
|
205
200
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
206
|
-
checksum = "
|
|
201
|
+
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
|
|
207
202
|
|
|
208
203
|
[[package]]
|
|
209
204
|
name = "encode_unicode"
|
|
@@ -228,9 +223,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
|
|
228
223
|
|
|
229
224
|
[[package]]
|
|
230
225
|
name = "getrandom"
|
|
231
|
-
version = "0.2.
|
|
226
|
+
version = "0.2.15"
|
|
232
227
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
233
|
-
checksum = "
|
|
228
|
+
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
|
|
234
229
|
dependencies = [
|
|
235
230
|
"cfg-if",
|
|
236
231
|
"libc",
|
|
@@ -251,9 +246,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
|
|
|
251
246
|
|
|
252
247
|
[[package]]
|
|
253
248
|
name = "indicatif"
|
|
254
|
-
version = "0.17.
|
|
249
|
+
version = "0.17.8"
|
|
255
250
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
256
|
-
checksum = "
|
|
251
|
+
checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3"
|
|
257
252
|
dependencies = [
|
|
258
253
|
"console",
|
|
259
254
|
"instant",
|
|
@@ -264,9 +259,9 @@ dependencies = [
|
|
|
264
259
|
|
|
265
260
|
[[package]]
|
|
266
261
|
name = "instant"
|
|
267
|
-
version = "0.1.
|
|
262
|
+
version = "0.1.13"
|
|
268
263
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
269
|
-
checksum = "
|
|
264
|
+
checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
|
|
270
265
|
dependencies = [
|
|
271
266
|
"cfg-if",
|
|
272
267
|
]
|
|
@@ -291,15 +286,15 @@ dependencies = [
|
|
|
291
286
|
|
|
292
287
|
[[package]]
|
|
293
288
|
name = "itoa"
|
|
294
|
-
version = "1.0.
|
|
289
|
+
version = "1.0.11"
|
|
295
290
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
296
|
-
checksum = "
|
|
291
|
+
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
|
|
297
292
|
|
|
298
293
|
[[package]]
|
|
299
294
|
name = "lazy_static"
|
|
300
|
-
version = "1.
|
|
295
|
+
version = "1.5.0"
|
|
301
296
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
302
|
-
checksum = "
|
|
297
|
+
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
|
303
298
|
|
|
304
299
|
[[package]]
|
|
305
300
|
name = "lazycell"
|
|
@@ -309,28 +304,25 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
|
|
309
304
|
|
|
310
305
|
[[package]]
|
|
311
306
|
name = "libc"
|
|
312
|
-
version = "0.2.
|
|
307
|
+
version = "0.2.158"
|
|
313
308
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
314
|
-
checksum = "
|
|
309
|
+
checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439"
|
|
315
310
|
|
|
316
311
|
[[package]]
|
|
317
312
|
name = "libloading"
|
|
318
|
-
version = "0.
|
|
313
|
+
version = "0.8.5"
|
|
319
314
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
320
|
-
checksum = "
|
|
315
|
+
checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
|
|
321
316
|
dependencies = [
|
|
322
317
|
"cfg-if",
|
|
323
|
-
"
|
|
318
|
+
"windows-targets",
|
|
324
319
|
]
|
|
325
320
|
|
|
326
321
|
[[package]]
|
|
327
322
|
name = "log"
|
|
328
|
-
version = "0.4.
|
|
323
|
+
version = "0.4.22"
|
|
329
324
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
330
|
-
checksum = "
|
|
331
|
-
dependencies = [
|
|
332
|
-
"cfg-if",
|
|
333
|
-
]
|
|
325
|
+
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
|
|
334
326
|
|
|
335
327
|
[[package]]
|
|
336
328
|
name = "macro_rules_attribute"
|
|
@@ -350,9 +342,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
|
|
|
350
342
|
|
|
351
343
|
[[package]]
|
|
352
344
|
name = "magnus"
|
|
353
|
-
version = "0.
|
|
345
|
+
version = "0.7.1"
|
|
354
346
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
355
|
-
checksum = "
|
|
347
|
+
checksum = "3d87ae53030f3a22e83879e666cb94e58a7bdf31706878a0ba48752994146dab"
|
|
356
348
|
dependencies = [
|
|
357
349
|
"magnus-macros",
|
|
358
350
|
"rb-sys",
|
|
@@ -373,18 +365,9 @@ dependencies = [
|
|
|
373
365
|
|
|
374
366
|
[[package]]
|
|
375
367
|
name = "memchr"
|
|
376
|
-
version = "2.
|
|
377
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
378
|
-
checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c"
|
|
379
|
-
|
|
380
|
-
[[package]]
|
|
381
|
-
name = "memoffset"
|
|
382
|
-
version = "0.8.0"
|
|
368
|
+
version = "2.7.4"
|
|
383
369
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
384
|
-
checksum = "
|
|
385
|
-
dependencies = [
|
|
386
|
-
"autocfg",
|
|
387
|
-
]
|
|
370
|
+
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
|
388
371
|
|
|
389
372
|
[[package]]
|
|
390
373
|
name = "minimal-lexical"
|
|
@@ -394,9 +377,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
|
|
394
377
|
|
|
395
378
|
[[package]]
|
|
396
379
|
name = "monostate"
|
|
397
|
-
version = "0.1.
|
|
380
|
+
version = "0.1.13"
|
|
398
381
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
399
|
-
checksum = "
|
|
382
|
+
checksum = "0d208407d7552cd041d8cdb69a1bc3303e029c598738177a3d87082004dc0e1e"
|
|
400
383
|
dependencies = [
|
|
401
384
|
"monostate-impl",
|
|
402
385
|
"serde",
|
|
@@ -404,9 +387,9 @@ dependencies = [
|
|
|
404
387
|
|
|
405
388
|
[[package]]
|
|
406
389
|
name = "monostate-impl"
|
|
407
|
-
version = "0.1.
|
|
390
|
+
version = "0.1.13"
|
|
408
391
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
409
|
-
checksum = "
|
|
392
|
+
checksum = "a7ce64b975ed4f123575d11afd9491f2e37bbd5813fbfbc0f09ae1fbddea74e0"
|
|
410
393
|
dependencies = [
|
|
411
394
|
"proc-macro2",
|
|
412
395
|
"quote",
|
|
@@ -431,9 +414,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
|
|
|
431
414
|
|
|
432
415
|
[[package]]
|
|
433
416
|
name = "once_cell"
|
|
434
|
-
version = "1.
|
|
417
|
+
version = "1.19.0"
|
|
435
418
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
436
|
-
checksum = "
|
|
419
|
+
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
|
|
437
420
|
|
|
438
421
|
[[package]]
|
|
439
422
|
name = "onig"
|
|
@@ -459,48 +442,45 @@ dependencies = [
|
|
|
459
442
|
|
|
460
443
|
[[package]]
|
|
461
444
|
name = "paste"
|
|
462
|
-
version = "1.0.
|
|
445
|
+
version = "1.0.15"
|
|
463
446
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
464
|
-
checksum = "
|
|
465
|
-
|
|
466
|
-
[[package]]
|
|
467
|
-
name = "peeking_take_while"
|
|
468
|
-
version = "0.1.2"
|
|
469
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
470
|
-
checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
|
|
447
|
+
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
|
|
471
448
|
|
|
472
449
|
[[package]]
|
|
473
450
|
name = "pkg-config"
|
|
474
|
-
version = "0.3.
|
|
451
|
+
version = "0.3.30"
|
|
475
452
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
476
|
-
checksum = "
|
|
453
|
+
checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec"
|
|
477
454
|
|
|
478
455
|
[[package]]
|
|
479
456
|
name = "portable-atomic"
|
|
480
|
-
version = "1.
|
|
457
|
+
version = "1.7.0"
|
|
481
458
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
482
|
-
checksum = "
|
|
459
|
+
checksum = "da544ee218f0d287a911e9c99a39a8c9bc8fcad3cb8db5959940044ecfc67265"
|
|
483
460
|
|
|
484
461
|
[[package]]
|
|
485
462
|
name = "ppv-lite86"
|
|
486
|
-
version = "0.2.
|
|
463
|
+
version = "0.2.20"
|
|
487
464
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
488
|
-
checksum = "
|
|
465
|
+
checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
|
|
466
|
+
dependencies = [
|
|
467
|
+
"zerocopy",
|
|
468
|
+
]
|
|
489
469
|
|
|
490
470
|
[[package]]
|
|
491
471
|
name = "proc-macro2"
|
|
492
|
-
version = "1.0.
|
|
472
|
+
version = "1.0.86"
|
|
493
473
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
494
|
-
checksum = "
|
|
474
|
+
checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
|
|
495
475
|
dependencies = [
|
|
496
476
|
"unicode-ident",
|
|
497
477
|
]
|
|
498
478
|
|
|
499
479
|
[[package]]
|
|
500
480
|
name = "quote"
|
|
501
|
-
version = "1.0.
|
|
481
|
+
version = "1.0.37"
|
|
502
482
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
503
|
-
checksum = "
|
|
483
|
+
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
|
|
504
484
|
dependencies = [
|
|
505
485
|
"proc-macro2",
|
|
506
486
|
]
|
|
@@ -568,18 +548,18 @@ dependencies = [
|
|
|
568
548
|
|
|
569
549
|
[[package]]
|
|
570
550
|
name = "rb-sys"
|
|
571
|
-
version = "0.9.
|
|
551
|
+
version = "0.9.102"
|
|
572
552
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
573
|
-
checksum = "
|
|
553
|
+
checksum = "df4dec4b1d304c3b308a2cd86b1216ea45dd4361f4e9fa056f108332d0a450c1"
|
|
574
554
|
dependencies = [
|
|
575
555
|
"rb-sys-build",
|
|
576
556
|
]
|
|
577
557
|
|
|
578
558
|
[[package]]
|
|
579
559
|
name = "rb-sys-build"
|
|
580
|
-
version = "0.9.
|
|
560
|
+
version = "0.9.102"
|
|
581
561
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
582
|
-
checksum = "
|
|
562
|
+
checksum = "1d71de3e29d174b8fb17b5d4470f27d7aa2605f8a9d05fda0d3aeff30e05a570"
|
|
583
563
|
dependencies = [
|
|
584
564
|
"bindgen",
|
|
585
565
|
"lazy_static",
|
|
@@ -598,9 +578,9 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
|
|
|
598
578
|
|
|
599
579
|
[[package]]
|
|
600
580
|
name = "regex"
|
|
601
|
-
version = "1.10.
|
|
581
|
+
version = "1.10.6"
|
|
602
582
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
603
|
-
checksum = "
|
|
583
|
+
checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619"
|
|
604
584
|
dependencies = [
|
|
605
585
|
"aho-corasick",
|
|
606
586
|
"memchr",
|
|
@@ -610,9 +590,9 @@ dependencies = [
|
|
|
610
590
|
|
|
611
591
|
[[package]]
|
|
612
592
|
name = "regex-automata"
|
|
613
|
-
version = "0.4.
|
|
593
|
+
version = "0.4.7"
|
|
614
594
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
615
|
-
checksum = "
|
|
595
|
+
checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
|
|
616
596
|
dependencies = [
|
|
617
597
|
"aho-corasick",
|
|
618
598
|
"memchr",
|
|
@@ -621,9 +601,9 @@ dependencies = [
|
|
|
621
601
|
|
|
622
602
|
[[package]]
|
|
623
603
|
name = "regex-syntax"
|
|
624
|
-
version = "0.8.
|
|
604
|
+
version = "0.8.4"
|
|
625
605
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
626
|
-
checksum = "
|
|
606
|
+
checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
|
|
627
607
|
|
|
628
608
|
[[package]]
|
|
629
609
|
name = "rustc-hash"
|
|
@@ -633,15 +613,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
|
|
|
633
613
|
|
|
634
614
|
[[package]]
|
|
635
615
|
name = "ryu"
|
|
636
|
-
version = "1.0.
|
|
637
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
638
|
-
checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
|
|
639
|
-
|
|
640
|
-
[[package]]
|
|
641
|
-
name = "scopeguard"
|
|
642
|
-
version = "1.1.0"
|
|
616
|
+
version = "1.0.18"
|
|
643
617
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
644
|
-
checksum = "
|
|
618
|
+
checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
|
|
645
619
|
|
|
646
620
|
[[package]]
|
|
647
621
|
name = "seq-macro"
|
|
@@ -651,18 +625,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
|
|
|
651
625
|
|
|
652
626
|
[[package]]
|
|
653
627
|
name = "serde"
|
|
654
|
-
version = "1.0.
|
|
628
|
+
version = "1.0.209"
|
|
655
629
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
656
|
-
checksum = "
|
|
630
|
+
checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09"
|
|
657
631
|
dependencies = [
|
|
658
632
|
"serde_derive",
|
|
659
633
|
]
|
|
660
634
|
|
|
661
635
|
[[package]]
|
|
662
636
|
name = "serde_derive"
|
|
663
|
-
version = "1.0.
|
|
637
|
+
version = "1.0.209"
|
|
664
638
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
665
|
-
checksum = "
|
|
639
|
+
checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170"
|
|
666
640
|
dependencies = [
|
|
667
641
|
"proc-macro2",
|
|
668
642
|
"quote",
|
|
@@ -671,11 +645,12 @@ dependencies = [
|
|
|
671
645
|
|
|
672
646
|
[[package]]
|
|
673
647
|
name = "serde_json"
|
|
674
|
-
version = "1.0.
|
|
648
|
+
version = "1.0.127"
|
|
675
649
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
676
|
-
checksum = "
|
|
650
|
+
checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad"
|
|
677
651
|
dependencies = [
|
|
678
652
|
"itoa",
|
|
653
|
+
"memchr",
|
|
679
654
|
"ryu",
|
|
680
655
|
"serde",
|
|
681
656
|
]
|
|
@@ -688,15 +663,15 @@ checksum = "24188a676b6ae68c3b2cb3a01be17fbf7240ce009799bb56d5b1409051e78fde"
|
|
|
688
663
|
|
|
689
664
|
[[package]]
|
|
690
665
|
name = "shlex"
|
|
691
|
-
version = "1.
|
|
666
|
+
version = "1.3.0"
|
|
692
667
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
693
|
-
checksum = "
|
|
668
|
+
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
|
694
669
|
|
|
695
670
|
[[package]]
|
|
696
671
|
name = "smallvec"
|
|
697
|
-
version = "1.
|
|
672
|
+
version = "1.13.2"
|
|
698
673
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
699
|
-
checksum = "
|
|
674
|
+
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
|
|
700
675
|
|
|
701
676
|
[[package]]
|
|
702
677
|
name = "spm_precompiled"
|
|
@@ -712,15 +687,15 @@ dependencies = [
|
|
|
712
687
|
|
|
713
688
|
[[package]]
|
|
714
689
|
name = "strsim"
|
|
715
|
-
version = "0.
|
|
690
|
+
version = "0.11.1"
|
|
716
691
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
717
|
-
checksum = "
|
|
692
|
+
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
|
718
693
|
|
|
719
694
|
[[package]]
|
|
720
695
|
name = "syn"
|
|
721
|
-
version = "2.0.
|
|
696
|
+
version = "2.0.76"
|
|
722
697
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
723
|
-
checksum = "
|
|
698
|
+
checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525"
|
|
724
699
|
dependencies = [
|
|
725
700
|
"proc-macro2",
|
|
726
701
|
"quote",
|
|
@@ -729,18 +704,18 @@ dependencies = [
|
|
|
729
704
|
|
|
730
705
|
[[package]]
|
|
731
706
|
name = "thiserror"
|
|
732
|
-
version = "1.0.
|
|
707
|
+
version = "1.0.63"
|
|
733
708
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
734
|
-
checksum = "
|
|
709
|
+
checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724"
|
|
735
710
|
dependencies = [
|
|
736
711
|
"thiserror-impl",
|
|
737
712
|
]
|
|
738
713
|
|
|
739
714
|
[[package]]
|
|
740
715
|
name = "thiserror-impl"
|
|
741
|
-
version = "1.0.
|
|
716
|
+
version = "1.0.63"
|
|
742
717
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
743
|
-
checksum = "
|
|
718
|
+
checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261"
|
|
744
719
|
dependencies = [
|
|
745
720
|
"proc-macro2",
|
|
746
721
|
"quote",
|
|
@@ -749,19 +724,19 @@ dependencies = [
|
|
|
749
724
|
|
|
750
725
|
[[package]]
|
|
751
726
|
name = "tokenizers"
|
|
752
|
-
version = "0.5.
|
|
727
|
+
version = "0.5.2"
|
|
753
728
|
dependencies = [
|
|
754
729
|
"magnus",
|
|
755
730
|
"onig",
|
|
756
731
|
"serde",
|
|
757
|
-
"tokenizers 0.
|
|
732
|
+
"tokenizers 0.20.0",
|
|
758
733
|
]
|
|
759
734
|
|
|
760
735
|
[[package]]
|
|
761
736
|
name = "tokenizers"
|
|
762
|
-
version = "0.
|
|
737
|
+
version = "0.20.0"
|
|
763
738
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
764
|
-
checksum = "
|
|
739
|
+
checksum = "c8a24d7f7d6be5b9d1377418b893ab1808af0074f5d1bb2c64784452ddd2aa70"
|
|
765
740
|
dependencies = [
|
|
766
741
|
"aho-corasick",
|
|
767
742
|
"derive_builder",
|
|
@@ -791,9 +766,9 @@ dependencies = [
|
|
|
791
766
|
|
|
792
767
|
[[package]]
|
|
793
768
|
name = "unicode-ident"
|
|
794
|
-
version = "1.0.
|
|
769
|
+
version = "1.0.12"
|
|
795
770
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
796
|
-
checksum = "
|
|
771
|
+
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
|
|
797
772
|
|
|
798
773
|
[[package]]
|
|
799
774
|
name = "unicode-normalization-alignments"
|
|
@@ -812,9 +787,9 @@ checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
|
|
|
812
787
|
|
|
813
788
|
[[package]]
|
|
814
789
|
name = "unicode-width"
|
|
815
|
-
version = "0.1.
|
|
790
|
+
version = "0.1.13"
|
|
816
791
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
817
|
-
checksum = "
|
|
792
|
+
checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d"
|
|
818
793
|
|
|
819
794
|
[[package]]
|
|
820
795
|
name = "unicode_categories"
|
|
@@ -829,36 +804,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
829
804
|
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
|
830
805
|
|
|
831
806
|
[[package]]
|
|
832
|
-
name = "
|
|
833
|
-
version = "0.
|
|
807
|
+
name = "windows-sys"
|
|
808
|
+
version = "0.52.0"
|
|
834
809
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
835
|
-
checksum = "
|
|
810
|
+
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
|
|
836
811
|
dependencies = [
|
|
837
|
-
"
|
|
838
|
-
"winapi-x86_64-pc-windows-gnu",
|
|
812
|
+
"windows-targets",
|
|
839
813
|
]
|
|
840
814
|
|
|
841
815
|
[[package]]
|
|
842
|
-
name = "
|
|
843
|
-
version = "0.
|
|
816
|
+
name = "windows-targets"
|
|
817
|
+
version = "0.52.6"
|
|
844
818
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
845
|
-
checksum = "
|
|
846
|
-
|
|
847
|
-
[[package]]
|
|
848
|
-
name = "winapi-x86_64-pc-windows-gnu"
|
|
849
|
-
version = "0.4.0"
|
|
850
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
851
|
-
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
|
852
|
-
|
|
853
|
-
[[package]]
|
|
854
|
-
name = "windows-sys"
|
|
855
|
-
version = "0.42.0"
|
|
856
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
857
|
-
checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
|
|
819
|
+
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
|
|
858
820
|
dependencies = [
|
|
859
821
|
"windows_aarch64_gnullvm",
|
|
860
822
|
"windows_aarch64_msvc",
|
|
861
823
|
"windows_i686_gnu",
|
|
824
|
+
"windows_i686_gnullvm",
|
|
862
825
|
"windows_i686_msvc",
|
|
863
826
|
"windows_x86_64_gnu",
|
|
864
827
|
"windows_x86_64_gnullvm",
|
|
@@ -867,42 +830,69 @@ dependencies = [
|
|
|
867
830
|
|
|
868
831
|
[[package]]
|
|
869
832
|
name = "windows_aarch64_gnullvm"
|
|
870
|
-
version = "0.
|
|
833
|
+
version = "0.52.6"
|
|
871
834
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
872
|
-
checksum = "
|
|
835
|
+
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
|
|
873
836
|
|
|
874
837
|
[[package]]
|
|
875
838
|
name = "windows_aarch64_msvc"
|
|
876
|
-
version = "0.
|
|
839
|
+
version = "0.52.6"
|
|
877
840
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
878
|
-
checksum = "
|
|
841
|
+
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
|
|
879
842
|
|
|
880
843
|
[[package]]
|
|
881
844
|
name = "windows_i686_gnu"
|
|
882
|
-
version = "0.
|
|
845
|
+
version = "0.52.6"
|
|
846
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
847
|
+
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
|
|
848
|
+
|
|
849
|
+
[[package]]
|
|
850
|
+
name = "windows_i686_gnullvm"
|
|
851
|
+
version = "0.52.6"
|
|
883
852
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
884
|
-
checksum = "
|
|
853
|
+
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
|
|
885
854
|
|
|
886
855
|
[[package]]
|
|
887
856
|
name = "windows_i686_msvc"
|
|
888
|
-
version = "0.
|
|
857
|
+
version = "0.52.6"
|
|
889
858
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
890
|
-
checksum = "
|
|
859
|
+
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
|
|
891
860
|
|
|
892
861
|
[[package]]
|
|
893
862
|
name = "windows_x86_64_gnu"
|
|
894
|
-
version = "0.
|
|
863
|
+
version = "0.52.6"
|
|
895
864
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
896
|
-
checksum = "
|
|
865
|
+
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
|
|
897
866
|
|
|
898
867
|
[[package]]
|
|
899
868
|
name = "windows_x86_64_gnullvm"
|
|
900
|
-
version = "0.
|
|
869
|
+
version = "0.52.6"
|
|
901
870
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
902
|
-
checksum = "
|
|
871
|
+
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
|
|
903
872
|
|
|
904
873
|
[[package]]
|
|
905
874
|
name = "windows_x86_64_msvc"
|
|
906
|
-
version = "0.
|
|
875
|
+
version = "0.52.6"
|
|
876
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
877
|
+
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
|
878
|
+
|
|
879
|
+
[[package]]
|
|
880
|
+
name = "zerocopy"
|
|
881
|
+
version = "0.7.35"
|
|
882
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
883
|
+
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
|
|
884
|
+
dependencies = [
|
|
885
|
+
"byteorder",
|
|
886
|
+
"zerocopy-derive",
|
|
887
|
+
]
|
|
888
|
+
|
|
889
|
+
[[package]]
|
|
890
|
+
name = "zerocopy-derive"
|
|
891
|
+
version = "0.7.35"
|
|
907
892
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
908
|
-
checksum = "
|
|
893
|
+
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
|
|
894
|
+
dependencies = [
|
|
895
|
+
"proc-macro2",
|
|
896
|
+
"quote",
|
|
897
|
+
"syn",
|
|
898
|
+
]
|
data/ext/tokenizers/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "tokenizers"
|
|
3
|
-
version = "0.5.
|
|
3
|
+
version = "0.5.2"
|
|
4
4
|
license = "Apache-2.0"
|
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
|
6
6
|
edition = "2021"
|
|
@@ -11,11 +11,11 @@ publish = false
|
|
|
11
11
|
crate-type = ["cdylib"]
|
|
12
12
|
|
|
13
13
|
[dependencies]
|
|
14
|
-
magnus = "0.
|
|
14
|
+
magnus = "0.7"
|
|
15
15
|
onig = { version = "6", default-features = false }
|
|
16
16
|
serde = { version = "1", features = ["rc", "derive"] }
|
|
17
17
|
|
|
18
18
|
[dependencies.tokenizers]
|
|
19
|
-
version = "=0.
|
|
19
|
+
version = "=0.20.0" # also update in from_pretrained.rb
|
|
20
20
|
default-features = false
|
|
21
21
|
features = ["progressbar", "onig", "esaxx_fast"]
|
|
@@ -34,6 +34,12 @@ impl Decoder for RbDecoder {
|
|
|
34
34
|
}
|
|
35
35
|
}
|
|
36
36
|
|
|
37
|
+
impl RbDecoder {
|
|
38
|
+
pub fn decode(&self, tokens: Vec<String>) -> RbResult<String> {
|
|
39
|
+
self.decoder.decode(tokens).map_err(RbError::from)
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
|
|
37
43
|
macro_rules! getter {
|
|
38
44
|
($self: ident, $variant: ident, $($name: tt)+) => {{
|
|
39
45
|
let decoder = &$self.decoder;
|
|
@@ -358,6 +364,7 @@ unsafe impl TypedData for RbDecoder {
|
|
|
358
364
|
|
|
359
365
|
pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
360
366
|
let decoder = module.define_class("Decoder", ruby.class_object())?;
|
|
367
|
+
decoder.define_method("decode", method!(RbDecoder::decode, 1))?;
|
|
361
368
|
|
|
362
369
|
let class = module.define_class("BPEDecoder", decoder)?;
|
|
363
370
|
class.define_singleton_method("_new", function!(RbBPEDecoder::new, 1))?;
|
data/ext/tokenizers/src/lib.rs
CHANGED
|
@@ -42,6 +42,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
|
|
|
42
42
|
|
|
43
43
|
let class = module.define_class("Tokenizer", ruby.class_object())?;
|
|
44
44
|
class.define_singleton_method("new", function!(RbTokenizer::from_model, 1))?;
|
|
45
|
+
class.define_singleton_method("from_str", function!(RbTokenizer::from_str, 1))?;
|
|
45
46
|
class.define_singleton_method("from_file", function!(RbTokenizer::from_file, 1))?;
|
|
46
47
|
class.define_method(
|
|
47
48
|
"add_special_tokens",
|
|
@@ -54,12 +55,18 @@ fn init(ruby: &Ruby) -> RbResult<()> {
|
|
|
54
55
|
class.define_method("_encode_batch", method!(RbTokenizer::encode_batch, 3))?;
|
|
55
56
|
class.define_method("_decode", method!(RbTokenizer::decode, 2))?;
|
|
56
57
|
class.define_method("_decode_batch", method!(RbTokenizer::decode_batch, 2))?;
|
|
58
|
+
class.define_method("model", method!(RbTokenizer::get_model, 0))?;
|
|
59
|
+
class.define_method("model=", method!(RbTokenizer::set_model,1))?;
|
|
60
|
+
class.define_method("decoder", method!(RbTokenizer::get_decoder, 0))?;
|
|
57
61
|
class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
|
|
62
|
+
class.define_method("pre_tokenizer", method!(RbTokenizer::get_pre_tokenizer, 0))?;
|
|
58
63
|
class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
|
|
64
|
+
class.define_method("post_processor", method!(RbTokenizer::get_post_processor, 0))?;
|
|
59
65
|
class.define_method(
|
|
60
66
|
"post_processor=",
|
|
61
67
|
method!(RbTokenizer::set_post_processor, 1),
|
|
62
68
|
)?;
|
|
69
|
+
class.define_method("normalizer", method!(RbTokenizer::get_normalizer, 0))?;
|
|
63
70
|
class.define_method("normalizer=", method!(RbTokenizer::set_normalizer, 1))?;
|
|
64
71
|
class.define_method("token_to_id", method!(RbTokenizer::token_to_id, 1))?;
|
|
65
72
|
class.define_method("id_to_token", method!(RbTokenizer::id_to_token, 1))?;
|
|
@@ -222,8 +222,8 @@ pub struct RbSequence {}
|
|
|
222
222
|
impl RbSequence {
|
|
223
223
|
fn new(normalizers: RArray) -> RbResult<RbNormalizer> {
|
|
224
224
|
let mut sequence = Vec::with_capacity(normalizers.len());
|
|
225
|
-
for n in normalizers.
|
|
226
|
-
let normalizer: &RbNormalizer = TryConvert::try_convert(n
|
|
225
|
+
for n in normalizers.into_iter() {
|
|
226
|
+
let normalizer: &RbNormalizer = TryConvert::try_convert(n)?;
|
|
227
227
|
match &normalizer.normalizer {
|
|
228
228
|
RbNormalizerTypeWrapper::Sequence(inner) => sequence.extend(inner.iter().cloned()),
|
|
229
229
|
RbNormalizerTypeWrapper::Single(inner) => sequence.push(inner.clone()),
|
|
@@ -258,8 +258,8 @@ pub struct RbSequence {}
|
|
|
258
258
|
impl RbSequence {
|
|
259
259
|
fn new(pre_tokenizers: RArray) -> RbResult<RbPreTokenizer> {
|
|
260
260
|
let mut sequence = Vec::with_capacity(pre_tokenizers.len());
|
|
261
|
-
for n in pre_tokenizers.
|
|
262
|
-
let pretokenizer: &RbPreTokenizer = TryConvert::try_convert(n
|
|
261
|
+
for n in pre_tokenizers.into_iter() {
|
|
262
|
+
let pretokenizer: &RbPreTokenizer = TryConvert::try_convert(n)?;
|
|
263
263
|
match &pretokenizer.pretok {
|
|
264
264
|
RbPreTokenizerTypeWrapper::Sequence(inner) => {
|
|
265
265
|
sequence.extend(inner.iter().cloned())
|
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
use std::cell::RefCell;
|
|
2
2
|
use std::collections::HashMap;
|
|
3
3
|
use std::path::PathBuf;
|
|
4
|
+
use std::str::FromStr;
|
|
4
5
|
|
|
5
6
|
use magnus::prelude::*;
|
|
6
|
-
use magnus::{exception, Error, RArray, RHash, Symbol, TryConvert, Value};
|
|
7
|
+
use magnus::{exception, Error, RArray, RHash, RString, Symbol, TryConvert, Value};
|
|
7
8
|
use tk::tokenizer::{
|
|
8
9
|
Model, PaddingDirection, PaddingParams, PaddingStrategy,
|
|
9
10
|
TruncationDirection, TruncationParams, TruncationStrategy, TokenizerImpl
|
|
@@ -203,6 +204,14 @@ impl RbTokenizer {
|
|
|
203
204
|
RbTokenizer::new(TokenizerImpl::new(model.clone()))
|
|
204
205
|
}
|
|
205
206
|
|
|
207
|
+
pub fn from_str(json: RString) -> RbResult<Self> {
|
|
208
|
+
Tokenizer::from_str(unsafe { json.as_str()? })
|
|
209
|
+
.map(|v| RbTokenizer {
|
|
210
|
+
tokenizer: RefCell::new(v),
|
|
211
|
+
})
|
|
212
|
+
.map_err(RbError::from)
|
|
213
|
+
}
|
|
214
|
+
|
|
206
215
|
pub fn from_file(path: PathBuf) -> RbResult<Self> {
|
|
207
216
|
Tokenizer::from_file(path)
|
|
208
217
|
.map(|v| RbTokenizer {
|
|
@@ -282,12 +291,12 @@ impl RbTokenizer {
|
|
|
282
291
|
add_special_tokens: bool,
|
|
283
292
|
) -> RbResult<RArray> {
|
|
284
293
|
let input: Vec<tk::EncodeInput> = input
|
|
285
|
-
.
|
|
294
|
+
.into_iter()
|
|
286
295
|
.map(|o| {
|
|
287
296
|
let input: tk::EncodeInput = if is_pretokenized {
|
|
288
|
-
PreTokenizedEncodeInput::try_convert(o
|
|
297
|
+
PreTokenizedEncodeInput::try_convert(o)?.into()
|
|
289
298
|
} else {
|
|
290
|
-
TextEncodeInput::try_convert(o
|
|
299
|
+
TextEncodeInput::try_convert(o)?.into()
|
|
291
300
|
};
|
|
292
301
|
Ok(input)
|
|
293
302
|
})
|
|
@@ -319,26 +328,50 @@ impl RbTokenizer {
|
|
|
319
328
|
.map_err(RbError::from)
|
|
320
329
|
}
|
|
321
330
|
|
|
322
|
-
pub fn
|
|
323
|
-
self.tokenizer.
|
|
331
|
+
pub fn get_model(&self) -> RbModel {
|
|
332
|
+
self.tokenizer.borrow().get_model().clone()
|
|
333
|
+
}
|
|
334
|
+
|
|
335
|
+
pub fn set_model(&self, model: &RbModel) {
|
|
336
|
+
self.tokenizer.borrow_mut().with_model(model.clone());
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
pub fn get_decoder(&self) -> Option<RbDecoder> {
|
|
340
|
+
self.tokenizer.borrow().get_decoder().cloned()
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
pub fn set_decoder(&self, decoder: Option<&RbDecoder>) {
|
|
344
|
+
self.tokenizer.borrow_mut().with_decoder(decoder.cloned());
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
pub fn get_pre_tokenizer(&self) -> Option<RbPreTokenizer> {
|
|
348
|
+
self.tokenizer.borrow().get_pre_tokenizer().cloned()
|
|
324
349
|
}
|
|
325
350
|
|
|
326
|
-
pub fn set_pre_tokenizer(&self, pretok:
|
|
351
|
+
pub fn set_pre_tokenizer(&self, pretok: Option<&RbPreTokenizer>) {
|
|
327
352
|
self.tokenizer
|
|
328
353
|
.borrow_mut()
|
|
329
|
-
.with_pre_tokenizer(pretok.
|
|
354
|
+
.with_pre_tokenizer(pretok.cloned());
|
|
330
355
|
}
|
|
331
356
|
|
|
332
|
-
pub fn
|
|
357
|
+
pub fn get_post_processor(&self) -> Option<RbPostProcessor> {
|
|
358
|
+
self.tokenizer.borrow().get_post_processor().cloned()
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
pub fn set_post_processor(&self, processor: Option<&RbPostProcessor>) {
|
|
333
362
|
self.tokenizer
|
|
334
363
|
.borrow_mut()
|
|
335
|
-
.with_post_processor(processor.
|
|
364
|
+
.with_post_processor(processor.cloned());
|
|
365
|
+
}
|
|
366
|
+
|
|
367
|
+
pub fn get_normalizer(&self) -> Option<RbNormalizer> {
|
|
368
|
+
self.tokenizer.borrow().get_normalizer().cloned()
|
|
336
369
|
}
|
|
337
370
|
|
|
338
|
-
pub fn set_normalizer(&self, normalizer:
|
|
371
|
+
pub fn set_normalizer(&self, normalizer: Option<&RbNormalizer>) {
|
|
339
372
|
self.tokenizer
|
|
340
373
|
.borrow_mut()
|
|
341
|
-
.with_normalizer(normalizer.
|
|
374
|
+
.with_normalizer(normalizer.cloned());
|
|
342
375
|
}
|
|
343
376
|
|
|
344
377
|
pub fn token_to_id(&self, token: String) -> Option<u32> {
|
|
@@ -110,9 +110,9 @@ impl RbTrainer {
|
|
|
110
110
|
BpeTrainer,
|
|
111
111
|
special_tokens,
|
|
112
112
|
special_tokens
|
|
113
|
-
.
|
|
113
|
+
.into_iter()
|
|
114
114
|
.map(|token| {
|
|
115
|
-
if let Ok(content) = String::try_convert(token
|
|
115
|
+
if let Ok(content) = String::try_convert(token) {
|
|
116
116
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
|
117
117
|
} else {
|
|
118
118
|
todo!()
|
|
@@ -197,9 +197,9 @@ impl RbTrainer {
|
|
|
197
197
|
UnigramTrainer,
|
|
198
198
|
special_tokens,
|
|
199
199
|
special_tokens
|
|
200
|
-
.
|
|
200
|
+
.into_iter()
|
|
201
201
|
.map(|token| {
|
|
202
|
-
if let Ok(content) = String::try_convert(token
|
|
202
|
+
if let Ok(content) = String::try_convert(token) {
|
|
203
203
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
|
204
204
|
} else {
|
|
205
205
|
todo!()
|
|
@@ -268,9 +268,9 @@ impl RbTrainer {
|
|
|
268
268
|
WordLevelTrainer,
|
|
269
269
|
special_tokens,
|
|
270
270
|
special_tokens
|
|
271
|
-
.
|
|
271
|
+
.into_iter()
|
|
272
272
|
.map(|token| {
|
|
273
|
-
if let Ok(content) = String::try_convert(token
|
|
273
|
+
if let Ok(content) = String::try_convert(token) {
|
|
274
274
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
|
275
275
|
} else {
|
|
276
276
|
todo!()
|
|
@@ -322,9 +322,9 @@ impl RbTrainer {
|
|
|
322
322
|
WordPieceTrainer,
|
|
323
323
|
@set_special_tokens,
|
|
324
324
|
special_tokens
|
|
325
|
-
.
|
|
325
|
+
.into_iter()
|
|
326
326
|
.map(|token| {
|
|
327
|
-
if let Ok(content) = String::try_convert(token
|
|
327
|
+
if let Ok(content) = String::try_convert(token) {
|
|
328
328
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
|
329
329
|
} else {
|
|
330
330
|
todo!()
|
|
@@ -398,9 +398,9 @@ impl RbBpeTrainer {
|
|
|
398
398
|
if !value.is_nil() {
|
|
399
399
|
builder = builder.special_tokens(
|
|
400
400
|
RArray::try_convert(value)?
|
|
401
|
-
.
|
|
401
|
+
.into_iter()
|
|
402
402
|
.map(|token| {
|
|
403
|
-
if let Ok(content) = String::try_convert(token
|
|
403
|
+
if let Ok(content) = String::try_convert(token) {
|
|
404
404
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
|
405
405
|
} else {
|
|
406
406
|
todo!()
|
|
@@ -466,9 +466,9 @@ impl RbUnigramTrainer {
|
|
|
466
466
|
if !value.is_nil() {
|
|
467
467
|
builder.special_tokens(
|
|
468
468
|
RArray::try_convert(value)?
|
|
469
|
-
.
|
|
469
|
+
.into_iter()
|
|
470
470
|
.map(|token| {
|
|
471
|
-
if let Ok(content) = String::try_convert(token
|
|
471
|
+
if let Ok(content) = String::try_convert(token) {
|
|
472
472
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
|
473
473
|
} else {
|
|
474
474
|
todo!()
|
|
@@ -540,9 +540,9 @@ impl RbWordLevelTrainer {
|
|
|
540
540
|
if !value.is_nil() {
|
|
541
541
|
builder.special_tokens(
|
|
542
542
|
RArray::try_convert(value)?
|
|
543
|
-
.
|
|
543
|
+
.into_iter()
|
|
544
544
|
.map(|token| {
|
|
545
|
-
if let Ok(content) = String::try_convert(token
|
|
545
|
+
if let Ok(content) = String::try_convert(token) {
|
|
546
546
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
|
547
547
|
} else {
|
|
548
548
|
todo!()
|
|
@@ -581,9 +581,9 @@ impl RbWordPieceTrainer {
|
|
|
581
581
|
if !value.is_nil() {
|
|
582
582
|
builder = builder.special_tokens(
|
|
583
583
|
RArray::try_convert(value)?
|
|
584
|
-
.
|
|
584
|
+
.into_iter()
|
|
585
585
|
.map(|token| {
|
|
586
|
-
if let Ok(content) = String::try_convert(token
|
|
586
|
+
if let Ok(content) = String::try_convert(token) {
|
|
587
587
|
Ok(RbAddedToken::from(content, Some(true)).get_token())
|
|
588
588
|
} else {
|
|
589
589
|
todo!()
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
module Tokenizers
|
|
2
2
|
module FromPretrained
|
|
3
3
|
# for user agent
|
|
4
|
-
TOKENIZERS_VERSION = "0.
|
|
4
|
+
TOKENIZERS_VERSION = "0.20.0"
|
|
5
5
|
|
|
6
6
|
# use Ruby for downloads
|
|
7
7
|
# this avoids the need to vendor OpenSSL on Linux
|
|
@@ -67,7 +67,7 @@ module Tokenizers
|
|
|
67
67
|
end
|
|
68
68
|
end
|
|
69
69
|
|
|
70
|
-
options[:content_length_proc] = ->
|
|
70
|
+
options[:content_length_proc] = ->(_) { puts "Downloading..." }
|
|
71
71
|
|
|
72
72
|
# string options are headers
|
|
73
73
|
tempfile = URI.parse(url).open(headers.merge(options))
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tokenizers
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.
|
|
4
|
+
version: 0.5.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Kane
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2024-
|
|
11
|
+
date: 2024-08-26 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
100
100
|
- !ruby/object:Gem::Version
|
|
101
101
|
version: '0'
|
|
102
102
|
requirements: []
|
|
103
|
-
rubygems_version: 3.5.
|
|
103
|
+
rubygems_version: 3.5.11
|
|
104
104
|
signing_key:
|
|
105
105
|
specification_version: 4
|
|
106
106
|
summary: Fast state-of-the-art tokenizers for Ruby
|