tokenizers 0.5.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 258211e71ca06e96bb4ee01b15e29f6f74d3c70d04af246e95b178e10f093059
4
- data.tar.gz: 6e0b01c577830afdf1c7d677b1377191420d85e0f1f8638893f72cbb7ccef322
3
+ metadata.gz: 556d084ad69603fa0d5ff61c4a03864d56fbe4d525d706390851d1a5761d173a
4
+ data.tar.gz: 4dd1ab9a2de88f60135aca333c7d48557fa1ad6c8e31c61065717e77a37f0cdd
5
5
  SHA512:
6
- metadata.gz: 4e0ea1f11dbab96b213190397ee8676d6233568f4fe013970a5a2c32105ed20ec06a5c8bc7379065799de315a0fc6d5f47807f9af47bc6f47926e4147c3eabcc
7
- data.tar.gz: ccd00b103577c6cff4dded6a3bc42394eccb3e24b950674a33eedf76df7c08bc89cda8219f076fce4cf20d90580da82c03e001a4e49ceb80e56ae4055b4617cf
6
+ metadata.gz: 3d6b7209189aeec8846a50f0e65a24e1089e7e0998d1f3d07026446c18b2b0c139d5b7566ca2fc721f72b67519c50595144d6deb3603d032fb41d20c7bc8c6e7
7
+ data.tar.gz: a7f04aa13c7cbc3c3973408140fc9f4c3330dacd150d4edd33f16e6c218e03a949c9d44ff03b7b4b6e63eedd2623b2abf417a647a8f2260aa67d64594f06c6fc
data/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
1
+ ## 0.5.2 (2024-08-26)
2
+
3
+ - Added `from_str` method to `Tokenizer`
4
+ - Added `model` and `model=` methods to `Tokenizer`
5
+ - Added `decoder`, `pre_tokenizer`, `post_processor`, and `normalizer` methods to `Tokenizer`
6
+ - Added `decode` method to `Decoder`
7
+
8
+ ## 0.5.1 (2024-08-13)
9
+
10
+ - Updated Tokenizers to 0.20.0
11
+ - Added precompiled gem for Linux ARM MUSL
12
+
1
13
  ## 0.5.0 (2024-05-21)
2
14
 
3
15
  - Updated Tokenizers to 0.19.1
data/Cargo.lock CHANGED
@@ -4,19 +4,13 @@ version = 3
4
4
 
5
5
  [[package]]
6
6
  name = "aho-corasick"
7
- version = "1.1.1"
7
+ version = "1.1.3"
8
8
  source = "registry+https://github.com/rust-lang/crates.io-index"
9
- checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab"
9
+ checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
10
10
  dependencies = [
11
11
  "memchr",
12
12
  ]
13
13
 
14
- [[package]]
15
- name = "autocfg"
16
- version = "1.1.0"
17
- source = "registry+https://github.com/rust-lang/crates.io-index"
18
- checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
19
-
20
14
  [[package]]
21
15
  name = "base64"
22
16
  version = "0.13.1"
@@ -25,16 +19,16 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
25
19
 
26
20
  [[package]]
27
21
  name = "bindgen"
28
- version = "0.69.1"
22
+ version = "0.69.4"
29
23
  source = "registry+https://github.com/rust-lang/crates.io-index"
30
- checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2"
24
+ checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0"
31
25
  dependencies = [
32
- "bitflags 2.4.1",
26
+ "bitflags 2.6.0",
33
27
  "cexpr",
34
28
  "clang-sys",
29
+ "itertools 0.12.1",
35
30
  "lazy_static",
36
31
  "lazycell",
37
- "peeking_take_while",
38
32
  "proc-macro2",
39
33
  "quote",
40
34
  "regex",
@@ -51,15 +45,24 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
51
45
 
52
46
  [[package]]
53
47
  name = "bitflags"
54
- version = "2.4.1"
48
+ version = "2.6.0"
55
49
  source = "registry+https://github.com/rust-lang/crates.io-index"
56
- checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
50
+ checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
51
+
52
+ [[package]]
53
+ name = "byteorder"
54
+ version = "1.5.0"
55
+ source = "registry+https://github.com/rust-lang/crates.io-index"
56
+ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
57
57
 
58
58
  [[package]]
59
59
  name = "cc"
60
- version = "1.0.79"
60
+ version = "1.1.15"
61
61
  source = "registry+https://github.com/rust-lang/crates.io-index"
62
- checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
62
+ checksum = "57b6a275aa2903740dc87da01c62040406b8812552e97129a63ea8850a17c6e6"
63
+ dependencies = [
64
+ "shlex",
65
+ ]
63
66
 
64
67
  [[package]]
65
68
  name = "cexpr"
@@ -78,9 +81,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
78
81
 
79
82
  [[package]]
80
83
  name = "clang-sys"
81
- version = "1.6.1"
84
+ version = "1.8.1"
82
85
  source = "registry+https://github.com/rust-lang/crates.io-index"
83
- checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f"
86
+ checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
84
87
  dependencies = [
85
88
  "glob",
86
89
  "libc",
@@ -89,9 +92,9 @@ dependencies = [
89
92
 
90
93
  [[package]]
91
94
  name = "console"
92
- version = "0.15.5"
95
+ version = "0.15.8"
93
96
  source = "registry+https://github.com/rust-lang/crates.io-index"
94
- checksum = "c3d79fbe8970a77e3e34151cc13d3b3e248aa0faaecb9f6091fa07ebefe5ad60"
97
+ checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb"
95
98
  dependencies = [
96
99
  "encode_unicode",
97
100
  "lazy_static",
@@ -102,42 +105,34 @@ dependencies = [
102
105
 
103
106
  [[package]]
104
107
  name = "crossbeam-deque"
105
- version = "0.8.3"
108
+ version = "0.8.5"
106
109
  source = "registry+https://github.com/rust-lang/crates.io-index"
107
- checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
110
+ checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
108
111
  dependencies = [
109
- "cfg-if",
110
112
  "crossbeam-epoch",
111
113
  "crossbeam-utils",
112
114
  ]
113
115
 
114
116
  [[package]]
115
117
  name = "crossbeam-epoch"
116
- version = "0.9.14"
118
+ version = "0.9.18"
117
119
  source = "registry+https://github.com/rust-lang/crates.io-index"
118
- checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
120
+ checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
119
121
  dependencies = [
120
- "autocfg",
121
- "cfg-if",
122
122
  "crossbeam-utils",
123
- "memoffset",
124
- "scopeguard",
125
123
  ]
126
124
 
127
125
  [[package]]
128
126
  name = "crossbeam-utils"
129
- version = "0.8.15"
127
+ version = "0.8.20"
130
128
  source = "registry+https://github.com/rust-lang/crates.io-index"
131
- checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b"
132
- dependencies = [
133
- "cfg-if",
134
- ]
129
+ checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
135
130
 
136
131
  [[package]]
137
132
  name = "darling"
138
- version = "0.20.8"
133
+ version = "0.20.10"
139
134
  source = "registry+https://github.com/rust-lang/crates.io-index"
140
- checksum = "54e36fcd13ed84ffdfda6f5be89b31287cbb80c439841fe69e04841435464391"
135
+ checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989"
141
136
  dependencies = [
142
137
  "darling_core",
143
138
  "darling_macro",
@@ -145,9 +140,9 @@ dependencies = [
145
140
 
146
141
  [[package]]
147
142
  name = "darling_core"
148
- version = "0.20.8"
143
+ version = "0.20.10"
149
144
  source = "registry+https://github.com/rust-lang/crates.io-index"
150
- checksum = "9c2cf1c23a687a1feeb728783b993c4e1ad83d99f351801977dd809b48d0a70f"
145
+ checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5"
151
146
  dependencies = [
152
147
  "fnv",
153
148
  "ident_case",
@@ -159,9 +154,9 @@ dependencies = [
159
154
 
160
155
  [[package]]
161
156
  name = "darling_macro"
162
- version = "0.20.8"
157
+ version = "0.20.10"
163
158
  source = "registry+https://github.com/rust-lang/crates.io-index"
164
- checksum = "a668eda54683121533a393014d8692171709ff57a7d61f187b6e782719f8933f"
159
+ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806"
165
160
  dependencies = [
166
161
  "darling_core",
167
162
  "quote",
@@ -201,9 +196,9 @@ dependencies = [
201
196
 
202
197
  [[package]]
203
198
  name = "either"
204
- version = "1.8.1"
199
+ version = "1.13.0"
205
200
  source = "registry+https://github.com/rust-lang/crates.io-index"
206
- checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91"
201
+ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
207
202
 
208
203
  [[package]]
209
204
  name = "encode_unicode"
@@ -228,9 +223,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
228
223
 
229
224
  [[package]]
230
225
  name = "getrandom"
231
- version = "0.2.10"
226
+ version = "0.2.15"
232
227
  source = "registry+https://github.com/rust-lang/crates.io-index"
233
- checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427"
228
+ checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
234
229
  dependencies = [
235
230
  "cfg-if",
236
231
  "libc",
@@ -251,9 +246,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
251
246
 
252
247
  [[package]]
253
248
  name = "indicatif"
254
- version = "0.17.7"
249
+ version = "0.17.8"
255
250
  source = "registry+https://github.com/rust-lang/crates.io-index"
256
- checksum = "fb28741c9db9a713d93deb3bb9515c20788cef5815265bee4980e87bde7e0f25"
251
+ checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3"
257
252
  dependencies = [
258
253
  "console",
259
254
  "instant",
@@ -264,9 +259,9 @@ dependencies = [
264
259
 
265
260
  [[package]]
266
261
  name = "instant"
267
- version = "0.1.12"
262
+ version = "0.1.13"
268
263
  source = "registry+https://github.com/rust-lang/crates.io-index"
269
- checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
264
+ checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
270
265
  dependencies = [
271
266
  "cfg-if",
272
267
  ]
@@ -291,15 +286,15 @@ dependencies = [
291
286
 
292
287
  [[package]]
293
288
  name = "itoa"
294
- version = "1.0.6"
289
+ version = "1.0.11"
295
290
  source = "registry+https://github.com/rust-lang/crates.io-index"
296
- checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
291
+ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
297
292
 
298
293
  [[package]]
299
294
  name = "lazy_static"
300
- version = "1.4.0"
295
+ version = "1.5.0"
301
296
  source = "registry+https://github.com/rust-lang/crates.io-index"
302
- checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
297
+ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
303
298
 
304
299
  [[package]]
305
300
  name = "lazycell"
@@ -309,28 +304,25 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
309
304
 
310
305
  [[package]]
311
306
  name = "libc"
312
- version = "0.2.149"
307
+ version = "0.2.158"
313
308
  source = "registry+https://github.com/rust-lang/crates.io-index"
314
- checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
309
+ checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439"
315
310
 
316
311
  [[package]]
317
312
  name = "libloading"
318
- version = "0.7.4"
313
+ version = "0.8.5"
319
314
  source = "registry+https://github.com/rust-lang/crates.io-index"
320
- checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f"
315
+ checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
321
316
  dependencies = [
322
317
  "cfg-if",
323
- "winapi",
318
+ "windows-targets",
324
319
  ]
325
320
 
326
321
  [[package]]
327
322
  name = "log"
328
- version = "0.4.17"
323
+ version = "0.4.22"
329
324
  source = "registry+https://github.com/rust-lang/crates.io-index"
330
- checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
331
- dependencies = [
332
- "cfg-if",
333
- ]
325
+ checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
334
326
 
335
327
  [[package]]
336
328
  name = "macro_rules_attribute"
@@ -350,9 +342,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
350
342
 
351
343
  [[package]]
352
344
  name = "magnus"
353
- version = "0.6.4"
345
+ version = "0.7.1"
354
346
  source = "registry+https://github.com/rust-lang/crates.io-index"
355
- checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
347
+ checksum = "3d87ae53030f3a22e83879e666cb94e58a7bdf31706878a0ba48752994146dab"
356
348
  dependencies = [
357
349
  "magnus-macros",
358
350
  "rb-sys",
@@ -373,18 +365,9 @@ dependencies = [
373
365
 
374
366
  [[package]]
375
367
  name = "memchr"
376
- version = "2.6.3"
377
- source = "registry+https://github.com/rust-lang/crates.io-index"
378
- checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c"
379
-
380
- [[package]]
381
- name = "memoffset"
382
- version = "0.8.0"
368
+ version = "2.7.4"
383
369
  source = "registry+https://github.com/rust-lang/crates.io-index"
384
- checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
385
- dependencies = [
386
- "autocfg",
387
- ]
370
+ checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
388
371
 
389
372
  [[package]]
390
373
  name = "minimal-lexical"
@@ -394,9 +377,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
394
377
 
395
378
  [[package]]
396
379
  name = "monostate"
397
- version = "0.1.12"
380
+ version = "0.1.13"
398
381
  source = "registry+https://github.com/rust-lang/crates.io-index"
399
- checksum = "a20fffcd8ca4c69d31e036a71abc400147b41f90895df4edcb36497a1f8af8bf"
382
+ checksum = "0d208407d7552cd041d8cdb69a1bc3303e029c598738177a3d87082004dc0e1e"
400
383
  dependencies = [
401
384
  "monostate-impl",
402
385
  "serde",
@@ -404,9 +387,9 @@ dependencies = [
404
387
 
405
388
  [[package]]
406
389
  name = "monostate-impl"
407
- version = "0.1.12"
390
+ version = "0.1.13"
408
391
  source = "registry+https://github.com/rust-lang/crates.io-index"
409
- checksum = "bf307cbbbd777a9c10cec88ddafee572b3484caad5cce0c9236523c3803105a6"
392
+ checksum = "a7ce64b975ed4f123575d11afd9491f2e37bbd5813fbfbc0f09ae1fbddea74e0"
410
393
  dependencies = [
411
394
  "proc-macro2",
412
395
  "quote",
@@ -431,9 +414,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
431
414
 
432
415
  [[package]]
433
416
  name = "once_cell"
434
- version = "1.17.1"
417
+ version = "1.19.0"
435
418
  source = "registry+https://github.com/rust-lang/crates.io-index"
436
- checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
419
+ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
437
420
 
438
421
  [[package]]
439
422
  name = "onig"
@@ -459,48 +442,45 @@ dependencies = [
459
442
 
460
443
  [[package]]
461
444
  name = "paste"
462
- version = "1.0.14"
445
+ version = "1.0.15"
463
446
  source = "registry+https://github.com/rust-lang/crates.io-index"
464
- checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
465
-
466
- [[package]]
467
- name = "peeking_take_while"
468
- version = "0.1.2"
469
- source = "registry+https://github.com/rust-lang/crates.io-index"
470
- checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
447
+ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
471
448
 
472
449
  [[package]]
473
450
  name = "pkg-config"
474
- version = "0.3.26"
451
+ version = "0.3.30"
475
452
  source = "registry+https://github.com/rust-lang/crates.io-index"
476
- checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
453
+ checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec"
477
454
 
478
455
  [[package]]
479
456
  name = "portable-atomic"
480
- version = "1.4.3"
457
+ version = "1.7.0"
481
458
  source = "registry+https://github.com/rust-lang/crates.io-index"
482
- checksum = "31114a898e107c51bb1609ffaf55a0e011cf6a4d7f1170d0015a165082c0338b"
459
+ checksum = "da544ee218f0d287a911e9c99a39a8c9bc8fcad3cb8db5959940044ecfc67265"
483
460
 
484
461
  [[package]]
485
462
  name = "ppv-lite86"
486
- version = "0.2.17"
463
+ version = "0.2.20"
487
464
  source = "registry+https://github.com/rust-lang/crates.io-index"
488
- checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
465
+ checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
466
+ dependencies = [
467
+ "zerocopy",
468
+ ]
489
469
 
490
470
  [[package]]
491
471
  name = "proc-macro2"
492
- version = "1.0.81"
472
+ version = "1.0.86"
493
473
  source = "registry+https://github.com/rust-lang/crates.io-index"
494
- checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba"
474
+ checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
495
475
  dependencies = [
496
476
  "unicode-ident",
497
477
  ]
498
478
 
499
479
  [[package]]
500
480
  name = "quote"
501
- version = "1.0.36"
481
+ version = "1.0.37"
502
482
  source = "registry+https://github.com/rust-lang/crates.io-index"
503
- checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
483
+ checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
504
484
  dependencies = [
505
485
  "proc-macro2",
506
486
  ]
@@ -568,18 +548,18 @@ dependencies = [
568
548
 
569
549
  [[package]]
570
550
  name = "rb-sys"
571
- version = "0.9.97"
551
+ version = "0.9.102"
572
552
  source = "registry+https://github.com/rust-lang/crates.io-index"
573
- checksum = "47d30bcad206b51f2f66121190ca678dce1fdf3a2eae0ac5d838d1818b19bdf5"
553
+ checksum = "df4dec4b1d304c3b308a2cd86b1216ea45dd4361f4e9fa056f108332d0a450c1"
574
554
  dependencies = [
575
555
  "rb-sys-build",
576
556
  ]
577
557
 
578
558
  [[package]]
579
559
  name = "rb-sys-build"
580
- version = "0.9.97"
560
+ version = "0.9.102"
581
561
  source = "registry+https://github.com/rust-lang/crates.io-index"
582
- checksum = "3cbd92f281615f3c2dcb9dcb0f0576624752afbf9a7f99173b37c4b55b62dd8a"
562
+ checksum = "1d71de3e29d174b8fb17b5d4470f27d7aa2605f8a9d05fda0d3aeff30e05a570"
583
563
  dependencies = [
584
564
  "bindgen",
585
565
  "lazy_static",
@@ -598,9 +578,9 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
598
578
 
599
579
  [[package]]
600
580
  name = "regex"
601
- version = "1.10.4"
581
+ version = "1.10.6"
602
582
  source = "registry+https://github.com/rust-lang/crates.io-index"
603
- checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
583
+ checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619"
604
584
  dependencies = [
605
585
  "aho-corasick",
606
586
  "memchr",
@@ -610,9 +590,9 @@ dependencies = [
610
590
 
611
591
  [[package]]
612
592
  name = "regex-automata"
613
- version = "0.4.6"
593
+ version = "0.4.7"
614
594
  source = "registry+https://github.com/rust-lang/crates.io-index"
615
- checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
595
+ checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
616
596
  dependencies = [
617
597
  "aho-corasick",
618
598
  "memchr",
@@ -621,9 +601,9 @@ dependencies = [
621
601
 
622
602
  [[package]]
623
603
  name = "regex-syntax"
624
- version = "0.8.2"
604
+ version = "0.8.4"
625
605
  source = "registry+https://github.com/rust-lang/crates.io-index"
626
- checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
606
+ checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
627
607
 
628
608
  [[package]]
629
609
  name = "rustc-hash"
@@ -633,15 +613,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
633
613
 
634
614
  [[package]]
635
615
  name = "ryu"
636
- version = "1.0.13"
637
- source = "registry+https://github.com/rust-lang/crates.io-index"
638
- checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
639
-
640
- [[package]]
641
- name = "scopeguard"
642
- version = "1.1.0"
616
+ version = "1.0.18"
643
617
  source = "registry+https://github.com/rust-lang/crates.io-index"
644
- checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
618
+ checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
645
619
 
646
620
  [[package]]
647
621
  name = "seq-macro"
@@ -651,18 +625,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
651
625
 
652
626
  [[package]]
653
627
  name = "serde"
654
- version = "1.0.188"
628
+ version = "1.0.209"
655
629
  source = "registry+https://github.com/rust-lang/crates.io-index"
656
- checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e"
630
+ checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09"
657
631
  dependencies = [
658
632
  "serde_derive",
659
633
  ]
660
634
 
661
635
  [[package]]
662
636
  name = "serde_derive"
663
- version = "1.0.188"
637
+ version = "1.0.209"
664
638
  source = "registry+https://github.com/rust-lang/crates.io-index"
665
- checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2"
639
+ checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170"
666
640
  dependencies = [
667
641
  "proc-macro2",
668
642
  "quote",
@@ -671,11 +645,12 @@ dependencies = [
671
645
 
672
646
  [[package]]
673
647
  name = "serde_json"
674
- version = "1.0.95"
648
+ version = "1.0.127"
675
649
  source = "registry+https://github.com/rust-lang/crates.io-index"
676
- checksum = "d721eca97ac802aa7777b701877c8004d950fc142651367300d21c1cc0194744"
650
+ checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad"
677
651
  dependencies = [
678
652
  "itoa",
653
+ "memchr",
679
654
  "ryu",
680
655
  "serde",
681
656
  ]
@@ -688,15 +663,15 @@ checksum = "24188a676b6ae68c3b2cb3a01be17fbf7240ce009799bb56d5b1409051e78fde"
688
663
 
689
664
  [[package]]
690
665
  name = "shlex"
691
- version = "1.1.0"
666
+ version = "1.3.0"
692
667
  source = "registry+https://github.com/rust-lang/crates.io-index"
693
- checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3"
668
+ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
694
669
 
695
670
  [[package]]
696
671
  name = "smallvec"
697
- version = "1.10.0"
672
+ version = "1.13.2"
698
673
  source = "registry+https://github.com/rust-lang/crates.io-index"
699
- checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
674
+ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
700
675
 
701
676
  [[package]]
702
677
  name = "spm_precompiled"
@@ -712,15 +687,15 @@ dependencies = [
712
687
 
713
688
  [[package]]
714
689
  name = "strsim"
715
- version = "0.10.0"
690
+ version = "0.11.1"
716
691
  source = "registry+https://github.com/rust-lang/crates.io-index"
717
- checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
692
+ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
718
693
 
719
694
  [[package]]
720
695
  name = "syn"
721
- version = "2.0.59"
696
+ version = "2.0.76"
722
697
  source = "registry+https://github.com/rust-lang/crates.io-index"
723
- checksum = "4a6531ffc7b071655e4ce2e04bd464c4830bb585a61cabb96cf808f05172615a"
698
+ checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525"
724
699
  dependencies = [
725
700
  "proc-macro2",
726
701
  "quote",
@@ -729,18 +704,18 @@ dependencies = [
729
704
 
730
705
  [[package]]
731
706
  name = "thiserror"
732
- version = "1.0.49"
707
+ version = "1.0.63"
733
708
  source = "registry+https://github.com/rust-lang/crates.io-index"
734
- checksum = "1177e8c6d7ede7afde3585fd2513e611227efd6481bd78d2e82ba1ce16557ed4"
709
+ checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724"
735
710
  dependencies = [
736
711
  "thiserror-impl",
737
712
  ]
738
713
 
739
714
  [[package]]
740
715
  name = "thiserror-impl"
741
- version = "1.0.49"
716
+ version = "1.0.63"
742
717
  source = "registry+https://github.com/rust-lang/crates.io-index"
743
- checksum = "10712f02019e9288794769fba95cd6847df9874d49d871d062172f9dd41bc4cc"
718
+ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261"
744
719
  dependencies = [
745
720
  "proc-macro2",
746
721
  "quote",
@@ -749,19 +724,19 @@ dependencies = [
749
724
 
750
725
  [[package]]
751
726
  name = "tokenizers"
752
- version = "0.5.0"
727
+ version = "0.5.2"
753
728
  dependencies = [
754
729
  "magnus",
755
730
  "onig",
756
731
  "serde",
757
- "tokenizers 0.19.1",
732
+ "tokenizers 0.20.0",
758
733
  ]
759
734
 
760
735
  [[package]]
761
736
  name = "tokenizers"
762
- version = "0.19.1"
737
+ version = "0.20.0"
763
738
  source = "registry+https://github.com/rust-lang/crates.io-index"
764
- checksum = "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd"
739
+ checksum = "c8a24d7f7d6be5b9d1377418b893ab1808af0074f5d1bb2c64784452ddd2aa70"
765
740
  dependencies = [
766
741
  "aho-corasick",
767
742
  "derive_builder",
@@ -791,9 +766,9 @@ dependencies = [
791
766
 
792
767
  [[package]]
793
768
  name = "unicode-ident"
794
- version = "1.0.8"
769
+ version = "1.0.12"
795
770
  source = "registry+https://github.com/rust-lang/crates.io-index"
796
- checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
771
+ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
797
772
 
798
773
  [[package]]
799
774
  name = "unicode-normalization-alignments"
@@ -812,9 +787,9 @@ checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
812
787
 
813
788
  [[package]]
814
789
  name = "unicode-width"
815
- version = "0.1.10"
790
+ version = "0.1.13"
816
791
  source = "registry+https://github.com/rust-lang/crates.io-index"
817
- checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
792
+ checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d"
818
793
 
819
794
  [[package]]
820
795
  name = "unicode_categories"
@@ -829,36 +804,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
829
804
  checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
830
805
 
831
806
  [[package]]
832
- name = "winapi"
833
- version = "0.3.9"
807
+ name = "windows-sys"
808
+ version = "0.52.0"
834
809
  source = "registry+https://github.com/rust-lang/crates.io-index"
835
- checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
810
+ checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
836
811
  dependencies = [
837
- "winapi-i686-pc-windows-gnu",
838
- "winapi-x86_64-pc-windows-gnu",
812
+ "windows-targets",
839
813
  ]
840
814
 
841
815
  [[package]]
842
- name = "winapi-i686-pc-windows-gnu"
843
- version = "0.4.0"
816
+ name = "windows-targets"
817
+ version = "0.52.6"
844
818
  source = "registry+https://github.com/rust-lang/crates.io-index"
845
- checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
846
-
847
- [[package]]
848
- name = "winapi-x86_64-pc-windows-gnu"
849
- version = "0.4.0"
850
- source = "registry+https://github.com/rust-lang/crates.io-index"
851
- checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
852
-
853
- [[package]]
854
- name = "windows-sys"
855
- version = "0.42.0"
856
- source = "registry+https://github.com/rust-lang/crates.io-index"
857
- checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
819
+ checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
858
820
  dependencies = [
859
821
  "windows_aarch64_gnullvm",
860
822
  "windows_aarch64_msvc",
861
823
  "windows_i686_gnu",
824
+ "windows_i686_gnullvm",
862
825
  "windows_i686_msvc",
863
826
  "windows_x86_64_gnu",
864
827
  "windows_x86_64_gnullvm",
@@ -867,42 +830,69 @@ dependencies = [
867
830
 
868
831
  [[package]]
869
832
  name = "windows_aarch64_gnullvm"
870
- version = "0.42.2"
833
+ version = "0.52.6"
871
834
  source = "registry+https://github.com/rust-lang/crates.io-index"
872
- checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
835
+ checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
873
836
 
874
837
  [[package]]
875
838
  name = "windows_aarch64_msvc"
876
- version = "0.42.2"
839
+ version = "0.52.6"
877
840
  source = "registry+https://github.com/rust-lang/crates.io-index"
878
- checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
841
+ checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
879
842
 
880
843
  [[package]]
881
844
  name = "windows_i686_gnu"
882
- version = "0.42.2"
845
+ version = "0.52.6"
846
+ source = "registry+https://github.com/rust-lang/crates.io-index"
847
+ checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
848
+
849
+ [[package]]
850
+ name = "windows_i686_gnullvm"
851
+ version = "0.52.6"
883
852
  source = "registry+https://github.com/rust-lang/crates.io-index"
884
- checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
853
+ checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
885
854
 
886
855
  [[package]]
887
856
  name = "windows_i686_msvc"
888
- version = "0.42.2"
857
+ version = "0.52.6"
889
858
  source = "registry+https://github.com/rust-lang/crates.io-index"
890
- checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
859
+ checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
891
860
 
892
861
  [[package]]
893
862
  name = "windows_x86_64_gnu"
894
- version = "0.42.2"
863
+ version = "0.52.6"
895
864
  source = "registry+https://github.com/rust-lang/crates.io-index"
896
- checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
865
+ checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
897
866
 
898
867
  [[package]]
899
868
  name = "windows_x86_64_gnullvm"
900
- version = "0.42.2"
869
+ version = "0.52.6"
901
870
  source = "registry+https://github.com/rust-lang/crates.io-index"
902
- checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
871
+ checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
903
872
 
904
873
  [[package]]
905
874
  name = "windows_x86_64_msvc"
906
- version = "0.42.2"
875
+ version = "0.52.6"
876
+ source = "registry+https://github.com/rust-lang/crates.io-index"
877
+ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
878
+
879
+ [[package]]
880
+ name = "zerocopy"
881
+ version = "0.7.35"
882
+ source = "registry+https://github.com/rust-lang/crates.io-index"
883
+ checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
884
+ dependencies = [
885
+ "byteorder",
886
+ "zerocopy-derive",
887
+ ]
888
+
889
+ [[package]]
890
+ name = "zerocopy-derive"
891
+ version = "0.7.35"
907
892
  source = "registry+https://github.com/rust-lang/crates.io-index"
908
- checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
893
+ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
894
+ dependencies = [
895
+ "proc-macro2",
896
+ "quote",
897
+ "syn",
898
+ ]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "tokenizers"
3
- version = "0.5.0"
3
+ version = "0.5.2"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -11,11 +11,11 @@ publish = false
11
11
  crate-type = ["cdylib"]
12
12
 
13
13
  [dependencies]
14
- magnus = "0.6"
14
+ magnus = "0.7"
15
15
  onig = { version = "6", default-features = false }
16
16
  serde = { version = "1", features = ["rc", "derive"] }
17
17
 
18
18
  [dependencies.tokenizers]
19
- version = "=0.19.1" # also update in from_pretrained.rb
19
+ version = "=0.20.0" # also update in from_pretrained.rb
20
20
  default-features = false
21
21
  features = ["progressbar", "onig", "esaxx_fast"]
@@ -34,6 +34,12 @@ impl Decoder for RbDecoder {
34
34
  }
35
35
  }
36
36
 
37
+ impl RbDecoder {
38
+ pub fn decode(&self, tokens: Vec<String>) -> RbResult<String> {
39
+ self.decoder.decode(tokens).map_err(RbError::from)
40
+ }
41
+ }
42
+
37
43
  macro_rules! getter {
38
44
  ($self: ident, $variant: ident, $($name: tt)+) => {{
39
45
  let decoder = &$self.decoder;
@@ -358,6 +364,7 @@ unsafe impl TypedData for RbDecoder {
358
364
 
359
365
  pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
360
366
  let decoder = module.define_class("Decoder", ruby.class_object())?;
367
+ decoder.define_method("decode", method!(RbDecoder::decode, 1))?;
361
368
 
362
369
  let class = module.define_class("BPEDecoder", decoder)?;
363
370
  class.define_singleton_method("_new", function!(RbBPEDecoder::new, 1))?;
@@ -42,6 +42,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
42
42
 
43
43
  let class = module.define_class("Tokenizer", ruby.class_object())?;
44
44
  class.define_singleton_method("new", function!(RbTokenizer::from_model, 1))?;
45
+ class.define_singleton_method("from_str", function!(RbTokenizer::from_str, 1))?;
45
46
  class.define_singleton_method("from_file", function!(RbTokenizer::from_file, 1))?;
46
47
  class.define_method(
47
48
  "add_special_tokens",
@@ -54,12 +55,18 @@ fn init(ruby: &Ruby) -> RbResult<()> {
54
55
  class.define_method("_encode_batch", method!(RbTokenizer::encode_batch, 3))?;
55
56
  class.define_method("_decode", method!(RbTokenizer::decode, 2))?;
56
57
  class.define_method("_decode_batch", method!(RbTokenizer::decode_batch, 2))?;
58
+ class.define_method("model", method!(RbTokenizer::get_model, 0))?;
59
+ class.define_method("model=", method!(RbTokenizer::set_model,1))?;
60
+ class.define_method("decoder", method!(RbTokenizer::get_decoder, 0))?;
57
61
  class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
62
+ class.define_method("pre_tokenizer", method!(RbTokenizer::get_pre_tokenizer, 0))?;
58
63
  class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
64
+ class.define_method("post_processor", method!(RbTokenizer::get_post_processor, 0))?;
59
65
  class.define_method(
60
66
  "post_processor=",
61
67
  method!(RbTokenizer::set_post_processor, 1),
62
68
  )?;
69
+ class.define_method("normalizer", method!(RbTokenizer::get_normalizer, 0))?;
63
70
  class.define_method("normalizer=", method!(RbTokenizer::set_normalizer, 1))?;
64
71
  class.define_method("token_to_id", method!(RbTokenizer::token_to_id, 1))?;
65
72
  class.define_method("id_to_token", method!(RbTokenizer::id_to_token, 1))?;
@@ -222,8 +222,8 @@ pub struct RbSequence {}
222
222
  impl RbSequence {
223
223
  fn new(normalizers: RArray) -> RbResult<RbNormalizer> {
224
224
  let mut sequence = Vec::with_capacity(normalizers.len());
225
- for n in normalizers.each() {
226
- let normalizer: &RbNormalizer = TryConvert::try_convert(n?)?;
225
+ for n in normalizers.into_iter() {
226
+ let normalizer: &RbNormalizer = TryConvert::try_convert(n)?;
227
227
  match &normalizer.normalizer {
228
228
  RbNormalizerTypeWrapper::Sequence(inner) => sequence.extend(inner.iter().cloned()),
229
229
  RbNormalizerTypeWrapper::Single(inner) => sequence.push(inner.clone()),
@@ -258,8 +258,8 @@ pub struct RbSequence {}
258
258
  impl RbSequence {
259
259
  fn new(pre_tokenizers: RArray) -> RbResult<RbPreTokenizer> {
260
260
  let mut sequence = Vec::with_capacity(pre_tokenizers.len());
261
- for n in pre_tokenizers.each() {
262
- let pretokenizer: &RbPreTokenizer = TryConvert::try_convert(n?)?;
261
+ for n in pre_tokenizers.into_iter() {
262
+ let pretokenizer: &RbPreTokenizer = TryConvert::try_convert(n)?;
263
263
  match &pretokenizer.pretok {
264
264
  RbPreTokenizerTypeWrapper::Sequence(inner) => {
265
265
  sequence.extend(inner.iter().cloned())
@@ -1,9 +1,10 @@
1
1
  use std::cell::RefCell;
2
2
  use std::collections::HashMap;
3
3
  use std::path::PathBuf;
4
+ use std::str::FromStr;
4
5
 
5
6
  use magnus::prelude::*;
6
- use magnus::{exception, Error, RArray, RHash, Symbol, TryConvert, Value};
7
+ use magnus::{exception, Error, RArray, RHash, RString, Symbol, TryConvert, Value};
7
8
  use tk::tokenizer::{
8
9
  Model, PaddingDirection, PaddingParams, PaddingStrategy,
9
10
  TruncationDirection, TruncationParams, TruncationStrategy, TokenizerImpl
@@ -203,6 +204,14 @@ impl RbTokenizer {
203
204
  RbTokenizer::new(TokenizerImpl::new(model.clone()))
204
205
  }
205
206
 
207
+ pub fn from_str(json: RString) -> RbResult<Self> {
208
+ Tokenizer::from_str(unsafe { json.as_str()? })
209
+ .map(|v| RbTokenizer {
210
+ tokenizer: RefCell::new(v),
211
+ })
212
+ .map_err(RbError::from)
213
+ }
214
+
206
215
  pub fn from_file(path: PathBuf) -> RbResult<Self> {
207
216
  Tokenizer::from_file(path)
208
217
  .map(|v| RbTokenizer {
@@ -282,12 +291,12 @@ impl RbTokenizer {
282
291
  add_special_tokens: bool,
283
292
  ) -> RbResult<RArray> {
284
293
  let input: Vec<tk::EncodeInput> = input
285
- .each()
294
+ .into_iter()
286
295
  .map(|o| {
287
296
  let input: tk::EncodeInput = if is_pretokenized {
288
- PreTokenizedEncodeInput::try_convert(o?)?.into()
297
+ PreTokenizedEncodeInput::try_convert(o)?.into()
289
298
  } else {
290
- TextEncodeInput::try_convert(o?)?.into()
299
+ TextEncodeInput::try_convert(o)?.into()
291
300
  };
292
301
  Ok(input)
293
302
  })
@@ -319,26 +328,50 @@ impl RbTokenizer {
319
328
  .map_err(RbError::from)
320
329
  }
321
330
 
322
- pub fn set_decoder(&self, decoder: &RbDecoder) {
323
- self.tokenizer.borrow_mut().with_decoder(decoder.clone());
331
+ pub fn get_model(&self) -> RbModel {
332
+ self.tokenizer.borrow().get_model().clone()
333
+ }
334
+
335
+ pub fn set_model(&self, model: &RbModel) {
336
+ self.tokenizer.borrow_mut().with_model(model.clone());
337
+ }
338
+
339
+ pub fn get_decoder(&self) -> Option<RbDecoder> {
340
+ self.tokenizer.borrow().get_decoder().cloned()
341
+ }
342
+
343
+ pub fn set_decoder(&self, decoder: Option<&RbDecoder>) {
344
+ self.tokenizer.borrow_mut().with_decoder(decoder.cloned());
345
+ }
346
+
347
+ pub fn get_pre_tokenizer(&self) -> Option<RbPreTokenizer> {
348
+ self.tokenizer.borrow().get_pre_tokenizer().cloned()
324
349
  }
325
350
 
326
- pub fn set_pre_tokenizer(&self, pretok: &RbPreTokenizer) {
351
+ pub fn set_pre_tokenizer(&self, pretok: Option<&RbPreTokenizer>) {
327
352
  self.tokenizer
328
353
  .borrow_mut()
329
- .with_pre_tokenizer(pretok.clone());
354
+ .with_pre_tokenizer(pretok.cloned());
330
355
  }
331
356
 
332
- pub fn set_post_processor(&self, processor: &RbPostProcessor) {
357
+ pub fn get_post_processor(&self) -> Option<RbPostProcessor> {
358
+ self.tokenizer.borrow().get_post_processor().cloned()
359
+ }
360
+
361
+ pub fn set_post_processor(&self, processor: Option<&RbPostProcessor>) {
333
362
  self.tokenizer
334
363
  .borrow_mut()
335
- .with_post_processor(processor.clone());
364
+ .with_post_processor(processor.cloned());
365
+ }
366
+
367
+ pub fn get_normalizer(&self) -> Option<RbNormalizer> {
368
+ self.tokenizer.borrow().get_normalizer().cloned()
336
369
  }
337
370
 
338
- pub fn set_normalizer(&self, normalizer: &RbNormalizer) {
371
+ pub fn set_normalizer(&self, normalizer: Option<&RbNormalizer>) {
339
372
  self.tokenizer
340
373
  .borrow_mut()
341
- .with_normalizer(normalizer.clone());
374
+ .with_normalizer(normalizer.cloned());
342
375
  }
343
376
 
344
377
  pub fn token_to_id(&self, token: String) -> Option<u32> {
@@ -110,9 +110,9 @@ impl RbTrainer {
110
110
  BpeTrainer,
111
111
  special_tokens,
112
112
  special_tokens
113
- .each()
113
+ .into_iter()
114
114
  .map(|token| {
115
- if let Ok(content) = String::try_convert(token?) {
115
+ if let Ok(content) = String::try_convert(token) {
116
116
  Ok(RbAddedToken::from(content, Some(true)).get_token())
117
117
  } else {
118
118
  todo!()
@@ -197,9 +197,9 @@ impl RbTrainer {
197
197
  UnigramTrainer,
198
198
  special_tokens,
199
199
  special_tokens
200
- .each()
200
+ .into_iter()
201
201
  .map(|token| {
202
- if let Ok(content) = String::try_convert(token?) {
202
+ if let Ok(content) = String::try_convert(token) {
203
203
  Ok(RbAddedToken::from(content, Some(true)).get_token())
204
204
  } else {
205
205
  todo!()
@@ -268,9 +268,9 @@ impl RbTrainer {
268
268
  WordLevelTrainer,
269
269
  special_tokens,
270
270
  special_tokens
271
- .each()
271
+ .into_iter()
272
272
  .map(|token| {
273
- if let Ok(content) = String::try_convert(token?) {
273
+ if let Ok(content) = String::try_convert(token) {
274
274
  Ok(RbAddedToken::from(content, Some(true)).get_token())
275
275
  } else {
276
276
  todo!()
@@ -322,9 +322,9 @@ impl RbTrainer {
322
322
  WordPieceTrainer,
323
323
  @set_special_tokens,
324
324
  special_tokens
325
- .each()
325
+ .into_iter()
326
326
  .map(|token| {
327
- if let Ok(content) = String::try_convert(token?) {
327
+ if let Ok(content) = String::try_convert(token) {
328
328
  Ok(RbAddedToken::from(content, Some(true)).get_token())
329
329
  } else {
330
330
  todo!()
@@ -398,9 +398,9 @@ impl RbBpeTrainer {
398
398
  if !value.is_nil() {
399
399
  builder = builder.special_tokens(
400
400
  RArray::try_convert(value)?
401
- .each()
401
+ .into_iter()
402
402
  .map(|token| {
403
- if let Ok(content) = String::try_convert(token?) {
403
+ if let Ok(content) = String::try_convert(token) {
404
404
  Ok(RbAddedToken::from(content, Some(true)).get_token())
405
405
  } else {
406
406
  todo!()
@@ -466,9 +466,9 @@ impl RbUnigramTrainer {
466
466
  if !value.is_nil() {
467
467
  builder.special_tokens(
468
468
  RArray::try_convert(value)?
469
- .each()
469
+ .into_iter()
470
470
  .map(|token| {
471
- if let Ok(content) = String::try_convert(token?) {
471
+ if let Ok(content) = String::try_convert(token) {
472
472
  Ok(RbAddedToken::from(content, Some(true)).get_token())
473
473
  } else {
474
474
  todo!()
@@ -540,9 +540,9 @@ impl RbWordLevelTrainer {
540
540
  if !value.is_nil() {
541
541
  builder.special_tokens(
542
542
  RArray::try_convert(value)?
543
- .each()
543
+ .into_iter()
544
544
  .map(|token| {
545
- if let Ok(content) = String::try_convert(token?) {
545
+ if let Ok(content) = String::try_convert(token) {
546
546
  Ok(RbAddedToken::from(content, Some(true)).get_token())
547
547
  } else {
548
548
  todo!()
@@ -581,9 +581,9 @@ impl RbWordPieceTrainer {
581
581
  if !value.is_nil() {
582
582
  builder = builder.special_tokens(
583
583
  RArray::try_convert(value)?
584
- .each()
584
+ .into_iter()
585
585
  .map(|token| {
586
- if let Ok(content) = String::try_convert(token?) {
586
+ if let Ok(content) = String::try_convert(token) {
587
587
  Ok(RbAddedToken::from(content, Some(true)).get_token())
588
588
  } else {
589
589
  todo!()
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.19.1"
4
+ TOKENIZERS_VERSION = "0.20.0"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -67,7 +67,7 @@ module Tokenizers
67
67
  end
68
68
  end
69
69
 
70
- options[:content_length_proc] = -> (_) { puts "Downloading..." }
70
+ options[:content_length_proc] = ->(_) { puts "Downloading..." }
71
71
 
72
72
  # string options are headers
73
73
  tempfile = URI.parse(url).open(headers.merge(options))
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.5.0"
2
+ VERSION = "0.5.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-05-21 00:00:00.000000000 Z
11
+ date: 2024-08-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
100
100
  - !ruby/object:Gem::Version
101
101
  version: '0'
102
102
  requirements: []
103
- rubygems_version: 3.5.9
103
+ rubygems_version: 3.5.11
104
104
  signing_key:
105
105
  specification_version: 4
106
106
  summary: Fast state-of-the-art tokenizers for Ruby