tokenizers 0.5.0 → 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 258211e71ca06e96bb4ee01b15e29f6f74d3c70d04af246e95b178e10f093059
4
- data.tar.gz: 6e0b01c577830afdf1c7d677b1377191420d85e0f1f8638893f72cbb7ccef322
3
+ metadata.gz: 556d084ad69603fa0d5ff61c4a03864d56fbe4d525d706390851d1a5761d173a
4
+ data.tar.gz: 4dd1ab9a2de88f60135aca333c7d48557fa1ad6c8e31c61065717e77a37f0cdd
5
5
  SHA512:
6
- metadata.gz: 4e0ea1f11dbab96b213190397ee8676d6233568f4fe013970a5a2c32105ed20ec06a5c8bc7379065799de315a0fc6d5f47807f9af47bc6f47926e4147c3eabcc
7
- data.tar.gz: ccd00b103577c6cff4dded6a3bc42394eccb3e24b950674a33eedf76df7c08bc89cda8219f076fce4cf20d90580da82c03e001a4e49ceb80e56ae4055b4617cf
6
+ metadata.gz: 3d6b7209189aeec8846a50f0e65a24e1089e7e0998d1f3d07026446c18b2b0c139d5b7566ca2fc721f72b67519c50595144d6deb3603d032fb41d20c7bc8c6e7
7
+ data.tar.gz: a7f04aa13c7cbc3c3973408140fc9f4c3330dacd150d4edd33f16e6c218e03a949c9d44ff03b7b4b6e63eedd2623b2abf417a647a8f2260aa67d64594f06c6fc
data/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
1
+ ## 0.5.2 (2024-08-26)
2
+
3
+ - Added `from_str` method to `Tokenizer`
4
+ - Added `model` and `model=` methods to `Tokenizer`
5
+ - Added `decoder`, `pre_tokenizer`, `post_processor`, and `normalizer` methods to `Tokenizer`
6
+ - Added `decode` method to `Decoder`
7
+
8
+ ## 0.5.1 (2024-08-13)
9
+
10
+ - Updated Tokenizers to 0.20.0
11
+ - Added precompiled gem for Linux ARM MUSL
12
+
1
13
  ## 0.5.0 (2024-05-21)
2
14
 
3
15
  - Updated Tokenizers to 0.19.1
data/Cargo.lock CHANGED
@@ -4,19 +4,13 @@ version = 3
4
4
 
5
5
  [[package]]
6
6
  name = "aho-corasick"
7
- version = "1.1.1"
7
+ version = "1.1.3"
8
8
  source = "registry+https://github.com/rust-lang/crates.io-index"
9
- checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab"
9
+ checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
10
10
  dependencies = [
11
11
  "memchr",
12
12
  ]
13
13
 
14
- [[package]]
15
- name = "autocfg"
16
- version = "1.1.0"
17
- source = "registry+https://github.com/rust-lang/crates.io-index"
18
- checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
19
-
20
14
  [[package]]
21
15
  name = "base64"
22
16
  version = "0.13.1"
@@ -25,16 +19,16 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
25
19
 
26
20
  [[package]]
27
21
  name = "bindgen"
28
- version = "0.69.1"
22
+ version = "0.69.4"
29
23
  source = "registry+https://github.com/rust-lang/crates.io-index"
30
- checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2"
24
+ checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0"
31
25
  dependencies = [
32
- "bitflags 2.4.1",
26
+ "bitflags 2.6.0",
33
27
  "cexpr",
34
28
  "clang-sys",
29
+ "itertools 0.12.1",
35
30
  "lazy_static",
36
31
  "lazycell",
37
- "peeking_take_while",
38
32
  "proc-macro2",
39
33
  "quote",
40
34
  "regex",
@@ -51,15 +45,24 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
51
45
 
52
46
  [[package]]
53
47
  name = "bitflags"
54
- version = "2.4.1"
48
+ version = "2.6.0"
55
49
  source = "registry+https://github.com/rust-lang/crates.io-index"
56
- checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
50
+ checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
51
+
52
+ [[package]]
53
+ name = "byteorder"
54
+ version = "1.5.0"
55
+ source = "registry+https://github.com/rust-lang/crates.io-index"
56
+ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
57
57
 
58
58
  [[package]]
59
59
  name = "cc"
60
- version = "1.0.79"
60
+ version = "1.1.15"
61
61
  source = "registry+https://github.com/rust-lang/crates.io-index"
62
- checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
62
+ checksum = "57b6a275aa2903740dc87da01c62040406b8812552e97129a63ea8850a17c6e6"
63
+ dependencies = [
64
+ "shlex",
65
+ ]
63
66
 
64
67
  [[package]]
65
68
  name = "cexpr"
@@ -78,9 +81,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
78
81
 
79
82
  [[package]]
80
83
  name = "clang-sys"
81
- version = "1.6.1"
84
+ version = "1.8.1"
82
85
  source = "registry+https://github.com/rust-lang/crates.io-index"
83
- checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f"
86
+ checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
84
87
  dependencies = [
85
88
  "glob",
86
89
  "libc",
@@ -89,9 +92,9 @@ dependencies = [
89
92
 
90
93
  [[package]]
91
94
  name = "console"
92
- version = "0.15.5"
95
+ version = "0.15.8"
93
96
  source = "registry+https://github.com/rust-lang/crates.io-index"
94
- checksum = "c3d79fbe8970a77e3e34151cc13d3b3e248aa0faaecb9f6091fa07ebefe5ad60"
97
+ checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb"
95
98
  dependencies = [
96
99
  "encode_unicode",
97
100
  "lazy_static",
@@ -102,42 +105,34 @@ dependencies = [
102
105
 
103
106
  [[package]]
104
107
  name = "crossbeam-deque"
105
- version = "0.8.3"
108
+ version = "0.8.5"
106
109
  source = "registry+https://github.com/rust-lang/crates.io-index"
107
- checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
110
+ checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
108
111
  dependencies = [
109
- "cfg-if",
110
112
  "crossbeam-epoch",
111
113
  "crossbeam-utils",
112
114
  ]
113
115
 
114
116
  [[package]]
115
117
  name = "crossbeam-epoch"
116
- version = "0.9.14"
118
+ version = "0.9.18"
117
119
  source = "registry+https://github.com/rust-lang/crates.io-index"
118
- checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
120
+ checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
119
121
  dependencies = [
120
- "autocfg",
121
- "cfg-if",
122
122
  "crossbeam-utils",
123
- "memoffset",
124
- "scopeguard",
125
123
  ]
126
124
 
127
125
  [[package]]
128
126
  name = "crossbeam-utils"
129
- version = "0.8.15"
127
+ version = "0.8.20"
130
128
  source = "registry+https://github.com/rust-lang/crates.io-index"
131
- checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b"
132
- dependencies = [
133
- "cfg-if",
134
- ]
129
+ checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
135
130
 
136
131
  [[package]]
137
132
  name = "darling"
138
- version = "0.20.8"
133
+ version = "0.20.10"
139
134
  source = "registry+https://github.com/rust-lang/crates.io-index"
140
- checksum = "54e36fcd13ed84ffdfda6f5be89b31287cbb80c439841fe69e04841435464391"
135
+ checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989"
141
136
  dependencies = [
142
137
  "darling_core",
143
138
  "darling_macro",
@@ -145,9 +140,9 @@ dependencies = [
145
140
 
146
141
  [[package]]
147
142
  name = "darling_core"
148
- version = "0.20.8"
143
+ version = "0.20.10"
149
144
  source = "registry+https://github.com/rust-lang/crates.io-index"
150
- checksum = "9c2cf1c23a687a1feeb728783b993c4e1ad83d99f351801977dd809b48d0a70f"
145
+ checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5"
151
146
  dependencies = [
152
147
  "fnv",
153
148
  "ident_case",
@@ -159,9 +154,9 @@ dependencies = [
159
154
 
160
155
  [[package]]
161
156
  name = "darling_macro"
162
- version = "0.20.8"
157
+ version = "0.20.10"
163
158
  source = "registry+https://github.com/rust-lang/crates.io-index"
164
- checksum = "a668eda54683121533a393014d8692171709ff57a7d61f187b6e782719f8933f"
159
+ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806"
165
160
  dependencies = [
166
161
  "darling_core",
167
162
  "quote",
@@ -201,9 +196,9 @@ dependencies = [
201
196
 
202
197
  [[package]]
203
198
  name = "either"
204
- version = "1.8.1"
199
+ version = "1.13.0"
205
200
  source = "registry+https://github.com/rust-lang/crates.io-index"
206
- checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91"
201
+ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
207
202
 
208
203
  [[package]]
209
204
  name = "encode_unicode"
@@ -228,9 +223,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
228
223
 
229
224
  [[package]]
230
225
  name = "getrandom"
231
- version = "0.2.10"
226
+ version = "0.2.15"
232
227
  source = "registry+https://github.com/rust-lang/crates.io-index"
233
- checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427"
228
+ checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
234
229
  dependencies = [
235
230
  "cfg-if",
236
231
  "libc",
@@ -251,9 +246,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
251
246
 
252
247
  [[package]]
253
248
  name = "indicatif"
254
- version = "0.17.7"
249
+ version = "0.17.8"
255
250
  source = "registry+https://github.com/rust-lang/crates.io-index"
256
- checksum = "fb28741c9db9a713d93deb3bb9515c20788cef5815265bee4980e87bde7e0f25"
251
+ checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3"
257
252
  dependencies = [
258
253
  "console",
259
254
  "instant",
@@ -264,9 +259,9 @@ dependencies = [
264
259
 
265
260
  [[package]]
266
261
  name = "instant"
267
- version = "0.1.12"
262
+ version = "0.1.13"
268
263
  source = "registry+https://github.com/rust-lang/crates.io-index"
269
- checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
264
+ checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
270
265
  dependencies = [
271
266
  "cfg-if",
272
267
  ]
@@ -291,15 +286,15 @@ dependencies = [
291
286
 
292
287
  [[package]]
293
288
  name = "itoa"
294
- version = "1.0.6"
289
+ version = "1.0.11"
295
290
  source = "registry+https://github.com/rust-lang/crates.io-index"
296
- checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
291
+ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
297
292
 
298
293
  [[package]]
299
294
  name = "lazy_static"
300
- version = "1.4.0"
295
+ version = "1.5.0"
301
296
  source = "registry+https://github.com/rust-lang/crates.io-index"
302
- checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
297
+ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
303
298
 
304
299
  [[package]]
305
300
  name = "lazycell"
@@ -309,28 +304,25 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
309
304
 
310
305
  [[package]]
311
306
  name = "libc"
312
- version = "0.2.149"
307
+ version = "0.2.158"
313
308
  source = "registry+https://github.com/rust-lang/crates.io-index"
314
- checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
309
+ checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439"
315
310
 
316
311
  [[package]]
317
312
  name = "libloading"
318
- version = "0.7.4"
313
+ version = "0.8.5"
319
314
  source = "registry+https://github.com/rust-lang/crates.io-index"
320
- checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f"
315
+ checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
321
316
  dependencies = [
322
317
  "cfg-if",
323
- "winapi",
318
+ "windows-targets",
324
319
  ]
325
320
 
326
321
  [[package]]
327
322
  name = "log"
328
- version = "0.4.17"
323
+ version = "0.4.22"
329
324
  source = "registry+https://github.com/rust-lang/crates.io-index"
330
- checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
331
- dependencies = [
332
- "cfg-if",
333
- ]
325
+ checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
334
326
 
335
327
  [[package]]
336
328
  name = "macro_rules_attribute"
@@ -350,9 +342,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
350
342
 
351
343
  [[package]]
352
344
  name = "magnus"
353
- version = "0.6.4"
345
+ version = "0.7.1"
354
346
  source = "registry+https://github.com/rust-lang/crates.io-index"
355
- checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
347
+ checksum = "3d87ae53030f3a22e83879e666cb94e58a7bdf31706878a0ba48752994146dab"
356
348
  dependencies = [
357
349
  "magnus-macros",
358
350
  "rb-sys",
@@ -373,18 +365,9 @@ dependencies = [
373
365
 
374
366
  [[package]]
375
367
  name = "memchr"
376
- version = "2.6.3"
377
- source = "registry+https://github.com/rust-lang/crates.io-index"
378
- checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c"
379
-
380
- [[package]]
381
- name = "memoffset"
382
- version = "0.8.0"
368
+ version = "2.7.4"
383
369
  source = "registry+https://github.com/rust-lang/crates.io-index"
384
- checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
385
- dependencies = [
386
- "autocfg",
387
- ]
370
+ checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
388
371
 
389
372
  [[package]]
390
373
  name = "minimal-lexical"
@@ -394,9 +377,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
394
377
 
395
378
  [[package]]
396
379
  name = "monostate"
397
- version = "0.1.12"
380
+ version = "0.1.13"
398
381
  source = "registry+https://github.com/rust-lang/crates.io-index"
399
- checksum = "a20fffcd8ca4c69d31e036a71abc400147b41f90895df4edcb36497a1f8af8bf"
382
+ checksum = "0d208407d7552cd041d8cdb69a1bc3303e029c598738177a3d87082004dc0e1e"
400
383
  dependencies = [
401
384
  "monostate-impl",
402
385
  "serde",
@@ -404,9 +387,9 @@ dependencies = [
404
387
 
405
388
  [[package]]
406
389
  name = "monostate-impl"
407
- version = "0.1.12"
390
+ version = "0.1.13"
408
391
  source = "registry+https://github.com/rust-lang/crates.io-index"
409
- checksum = "bf307cbbbd777a9c10cec88ddafee572b3484caad5cce0c9236523c3803105a6"
392
+ checksum = "a7ce64b975ed4f123575d11afd9491f2e37bbd5813fbfbc0f09ae1fbddea74e0"
410
393
  dependencies = [
411
394
  "proc-macro2",
412
395
  "quote",
@@ -431,9 +414,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
431
414
 
432
415
  [[package]]
433
416
  name = "once_cell"
434
- version = "1.17.1"
417
+ version = "1.19.0"
435
418
  source = "registry+https://github.com/rust-lang/crates.io-index"
436
- checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
419
+ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
437
420
 
438
421
  [[package]]
439
422
  name = "onig"
@@ -459,48 +442,45 @@ dependencies = [
459
442
 
460
443
  [[package]]
461
444
  name = "paste"
462
- version = "1.0.14"
445
+ version = "1.0.15"
463
446
  source = "registry+https://github.com/rust-lang/crates.io-index"
464
- checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
465
-
466
- [[package]]
467
- name = "peeking_take_while"
468
- version = "0.1.2"
469
- source = "registry+https://github.com/rust-lang/crates.io-index"
470
- checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
447
+ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
471
448
 
472
449
  [[package]]
473
450
  name = "pkg-config"
474
- version = "0.3.26"
451
+ version = "0.3.30"
475
452
  source = "registry+https://github.com/rust-lang/crates.io-index"
476
- checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
453
+ checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec"
477
454
 
478
455
  [[package]]
479
456
  name = "portable-atomic"
480
- version = "1.4.3"
457
+ version = "1.7.0"
481
458
  source = "registry+https://github.com/rust-lang/crates.io-index"
482
- checksum = "31114a898e107c51bb1609ffaf55a0e011cf6a4d7f1170d0015a165082c0338b"
459
+ checksum = "da544ee218f0d287a911e9c99a39a8c9bc8fcad3cb8db5959940044ecfc67265"
483
460
 
484
461
  [[package]]
485
462
  name = "ppv-lite86"
486
- version = "0.2.17"
463
+ version = "0.2.20"
487
464
  source = "registry+https://github.com/rust-lang/crates.io-index"
488
- checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
465
+ checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
466
+ dependencies = [
467
+ "zerocopy",
468
+ ]
489
469
 
490
470
  [[package]]
491
471
  name = "proc-macro2"
492
- version = "1.0.81"
472
+ version = "1.0.86"
493
473
  source = "registry+https://github.com/rust-lang/crates.io-index"
494
- checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba"
474
+ checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
495
475
  dependencies = [
496
476
  "unicode-ident",
497
477
  ]
498
478
 
499
479
  [[package]]
500
480
  name = "quote"
501
- version = "1.0.36"
481
+ version = "1.0.37"
502
482
  source = "registry+https://github.com/rust-lang/crates.io-index"
503
- checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
483
+ checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
504
484
  dependencies = [
505
485
  "proc-macro2",
506
486
  ]
@@ -568,18 +548,18 @@ dependencies = [
568
548
 
569
549
  [[package]]
570
550
  name = "rb-sys"
571
- version = "0.9.97"
551
+ version = "0.9.102"
572
552
  source = "registry+https://github.com/rust-lang/crates.io-index"
573
- checksum = "47d30bcad206b51f2f66121190ca678dce1fdf3a2eae0ac5d838d1818b19bdf5"
553
+ checksum = "df4dec4b1d304c3b308a2cd86b1216ea45dd4361f4e9fa056f108332d0a450c1"
574
554
  dependencies = [
575
555
  "rb-sys-build",
576
556
  ]
577
557
 
578
558
  [[package]]
579
559
  name = "rb-sys-build"
580
- version = "0.9.97"
560
+ version = "0.9.102"
581
561
  source = "registry+https://github.com/rust-lang/crates.io-index"
582
- checksum = "3cbd92f281615f3c2dcb9dcb0f0576624752afbf9a7f99173b37c4b55b62dd8a"
562
+ checksum = "1d71de3e29d174b8fb17b5d4470f27d7aa2605f8a9d05fda0d3aeff30e05a570"
583
563
  dependencies = [
584
564
  "bindgen",
585
565
  "lazy_static",
@@ -598,9 +578,9 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
598
578
 
599
579
  [[package]]
600
580
  name = "regex"
601
- version = "1.10.4"
581
+ version = "1.10.6"
602
582
  source = "registry+https://github.com/rust-lang/crates.io-index"
603
- checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
583
+ checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619"
604
584
  dependencies = [
605
585
  "aho-corasick",
606
586
  "memchr",
@@ -610,9 +590,9 @@ dependencies = [
610
590
 
611
591
  [[package]]
612
592
  name = "regex-automata"
613
- version = "0.4.6"
593
+ version = "0.4.7"
614
594
  source = "registry+https://github.com/rust-lang/crates.io-index"
615
- checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
595
+ checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
616
596
  dependencies = [
617
597
  "aho-corasick",
618
598
  "memchr",
@@ -621,9 +601,9 @@ dependencies = [
621
601
 
622
602
  [[package]]
623
603
  name = "regex-syntax"
624
- version = "0.8.2"
604
+ version = "0.8.4"
625
605
  source = "registry+https://github.com/rust-lang/crates.io-index"
626
- checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
606
+ checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
627
607
 
628
608
  [[package]]
629
609
  name = "rustc-hash"
@@ -633,15 +613,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
633
613
 
634
614
  [[package]]
635
615
  name = "ryu"
636
- version = "1.0.13"
637
- source = "registry+https://github.com/rust-lang/crates.io-index"
638
- checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
639
-
640
- [[package]]
641
- name = "scopeguard"
642
- version = "1.1.0"
616
+ version = "1.0.18"
643
617
  source = "registry+https://github.com/rust-lang/crates.io-index"
644
- checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
618
+ checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
645
619
 
646
620
  [[package]]
647
621
  name = "seq-macro"
@@ -651,18 +625,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
651
625
 
652
626
  [[package]]
653
627
  name = "serde"
654
- version = "1.0.188"
628
+ version = "1.0.209"
655
629
  source = "registry+https://github.com/rust-lang/crates.io-index"
656
- checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e"
630
+ checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09"
657
631
  dependencies = [
658
632
  "serde_derive",
659
633
  ]
660
634
 
661
635
  [[package]]
662
636
  name = "serde_derive"
663
- version = "1.0.188"
637
+ version = "1.0.209"
664
638
  source = "registry+https://github.com/rust-lang/crates.io-index"
665
- checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2"
639
+ checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170"
666
640
  dependencies = [
667
641
  "proc-macro2",
668
642
  "quote",
@@ -671,11 +645,12 @@ dependencies = [
671
645
 
672
646
  [[package]]
673
647
  name = "serde_json"
674
- version = "1.0.95"
648
+ version = "1.0.127"
675
649
  source = "registry+https://github.com/rust-lang/crates.io-index"
676
- checksum = "d721eca97ac802aa7777b701877c8004d950fc142651367300d21c1cc0194744"
650
+ checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad"
677
651
  dependencies = [
678
652
  "itoa",
653
+ "memchr",
679
654
  "ryu",
680
655
  "serde",
681
656
  ]
@@ -688,15 +663,15 @@ checksum = "24188a676b6ae68c3b2cb3a01be17fbf7240ce009799bb56d5b1409051e78fde"
688
663
 
689
664
  [[package]]
690
665
  name = "shlex"
691
- version = "1.1.0"
666
+ version = "1.3.0"
692
667
  source = "registry+https://github.com/rust-lang/crates.io-index"
693
- checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3"
668
+ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
694
669
 
695
670
  [[package]]
696
671
  name = "smallvec"
697
- version = "1.10.0"
672
+ version = "1.13.2"
698
673
  source = "registry+https://github.com/rust-lang/crates.io-index"
699
- checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
674
+ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
700
675
 
701
676
  [[package]]
702
677
  name = "spm_precompiled"
@@ -712,15 +687,15 @@ dependencies = [
712
687
 
713
688
  [[package]]
714
689
  name = "strsim"
715
- version = "0.10.0"
690
+ version = "0.11.1"
716
691
  source = "registry+https://github.com/rust-lang/crates.io-index"
717
- checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
692
+ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
718
693
 
719
694
  [[package]]
720
695
  name = "syn"
721
- version = "2.0.59"
696
+ version = "2.0.76"
722
697
  source = "registry+https://github.com/rust-lang/crates.io-index"
723
- checksum = "4a6531ffc7b071655e4ce2e04bd464c4830bb585a61cabb96cf808f05172615a"
698
+ checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525"
724
699
  dependencies = [
725
700
  "proc-macro2",
726
701
  "quote",
@@ -729,18 +704,18 @@ dependencies = [
729
704
 
730
705
  [[package]]
731
706
  name = "thiserror"
732
- version = "1.0.49"
707
+ version = "1.0.63"
733
708
  source = "registry+https://github.com/rust-lang/crates.io-index"
734
- checksum = "1177e8c6d7ede7afde3585fd2513e611227efd6481bd78d2e82ba1ce16557ed4"
709
+ checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724"
735
710
  dependencies = [
736
711
  "thiserror-impl",
737
712
  ]
738
713
 
739
714
  [[package]]
740
715
  name = "thiserror-impl"
741
- version = "1.0.49"
716
+ version = "1.0.63"
742
717
  source = "registry+https://github.com/rust-lang/crates.io-index"
743
- checksum = "10712f02019e9288794769fba95cd6847df9874d49d871d062172f9dd41bc4cc"
718
+ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261"
744
719
  dependencies = [
745
720
  "proc-macro2",
746
721
  "quote",
@@ -749,19 +724,19 @@ dependencies = [
749
724
 
750
725
  [[package]]
751
726
  name = "tokenizers"
752
- version = "0.5.0"
727
+ version = "0.5.2"
753
728
  dependencies = [
754
729
  "magnus",
755
730
  "onig",
756
731
  "serde",
757
- "tokenizers 0.19.1",
732
+ "tokenizers 0.20.0",
758
733
  ]
759
734
 
760
735
  [[package]]
761
736
  name = "tokenizers"
762
- version = "0.19.1"
737
+ version = "0.20.0"
763
738
  source = "registry+https://github.com/rust-lang/crates.io-index"
764
- checksum = "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd"
739
+ checksum = "c8a24d7f7d6be5b9d1377418b893ab1808af0074f5d1bb2c64784452ddd2aa70"
765
740
  dependencies = [
766
741
  "aho-corasick",
767
742
  "derive_builder",
@@ -791,9 +766,9 @@ dependencies = [
791
766
 
792
767
  [[package]]
793
768
  name = "unicode-ident"
794
- version = "1.0.8"
769
+ version = "1.0.12"
795
770
  source = "registry+https://github.com/rust-lang/crates.io-index"
796
- checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
771
+ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
797
772
 
798
773
  [[package]]
799
774
  name = "unicode-normalization-alignments"
@@ -812,9 +787,9 @@ checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
812
787
 
813
788
  [[package]]
814
789
  name = "unicode-width"
815
- version = "0.1.10"
790
+ version = "0.1.13"
816
791
  source = "registry+https://github.com/rust-lang/crates.io-index"
817
- checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
792
+ checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d"
818
793
 
819
794
  [[package]]
820
795
  name = "unicode_categories"
@@ -829,36 +804,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
829
804
  checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
830
805
 
831
806
  [[package]]
832
- name = "winapi"
833
- version = "0.3.9"
807
+ name = "windows-sys"
808
+ version = "0.52.0"
834
809
  source = "registry+https://github.com/rust-lang/crates.io-index"
835
- checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
810
+ checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
836
811
  dependencies = [
837
- "winapi-i686-pc-windows-gnu",
838
- "winapi-x86_64-pc-windows-gnu",
812
+ "windows-targets",
839
813
  ]
840
814
 
841
815
  [[package]]
842
- name = "winapi-i686-pc-windows-gnu"
843
- version = "0.4.0"
816
+ name = "windows-targets"
817
+ version = "0.52.6"
844
818
  source = "registry+https://github.com/rust-lang/crates.io-index"
845
- checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
846
-
847
- [[package]]
848
- name = "winapi-x86_64-pc-windows-gnu"
849
- version = "0.4.0"
850
- source = "registry+https://github.com/rust-lang/crates.io-index"
851
- checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
852
-
853
- [[package]]
854
- name = "windows-sys"
855
- version = "0.42.0"
856
- source = "registry+https://github.com/rust-lang/crates.io-index"
857
- checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
819
+ checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
858
820
  dependencies = [
859
821
  "windows_aarch64_gnullvm",
860
822
  "windows_aarch64_msvc",
861
823
  "windows_i686_gnu",
824
+ "windows_i686_gnullvm",
862
825
  "windows_i686_msvc",
863
826
  "windows_x86_64_gnu",
864
827
  "windows_x86_64_gnullvm",
@@ -867,42 +830,69 @@ dependencies = [
867
830
 
868
831
  [[package]]
869
832
  name = "windows_aarch64_gnullvm"
870
- version = "0.42.2"
833
+ version = "0.52.6"
871
834
  source = "registry+https://github.com/rust-lang/crates.io-index"
872
- checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
835
+ checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
873
836
 
874
837
  [[package]]
875
838
  name = "windows_aarch64_msvc"
876
- version = "0.42.2"
839
+ version = "0.52.6"
877
840
  source = "registry+https://github.com/rust-lang/crates.io-index"
878
- checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
841
+ checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
879
842
 
880
843
  [[package]]
881
844
  name = "windows_i686_gnu"
882
- version = "0.42.2"
845
+ version = "0.52.6"
846
+ source = "registry+https://github.com/rust-lang/crates.io-index"
847
+ checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
848
+
849
+ [[package]]
850
+ name = "windows_i686_gnullvm"
851
+ version = "0.52.6"
883
852
  source = "registry+https://github.com/rust-lang/crates.io-index"
884
- checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
853
+ checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
885
854
 
886
855
  [[package]]
887
856
  name = "windows_i686_msvc"
888
- version = "0.42.2"
857
+ version = "0.52.6"
889
858
  source = "registry+https://github.com/rust-lang/crates.io-index"
890
- checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
859
+ checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
891
860
 
892
861
  [[package]]
893
862
  name = "windows_x86_64_gnu"
894
- version = "0.42.2"
863
+ version = "0.52.6"
895
864
  source = "registry+https://github.com/rust-lang/crates.io-index"
896
- checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
865
+ checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
897
866
 
898
867
  [[package]]
899
868
  name = "windows_x86_64_gnullvm"
900
- version = "0.42.2"
869
+ version = "0.52.6"
901
870
  source = "registry+https://github.com/rust-lang/crates.io-index"
902
- checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
871
+ checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
903
872
 
904
873
  [[package]]
905
874
  name = "windows_x86_64_msvc"
906
- version = "0.42.2"
875
+ version = "0.52.6"
876
+ source = "registry+https://github.com/rust-lang/crates.io-index"
877
+ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
878
+
879
+ [[package]]
880
+ name = "zerocopy"
881
+ version = "0.7.35"
882
+ source = "registry+https://github.com/rust-lang/crates.io-index"
883
+ checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
884
+ dependencies = [
885
+ "byteorder",
886
+ "zerocopy-derive",
887
+ ]
888
+
889
+ [[package]]
890
+ name = "zerocopy-derive"
891
+ version = "0.7.35"
907
892
  source = "registry+https://github.com/rust-lang/crates.io-index"
908
- checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
893
+ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
894
+ dependencies = [
895
+ "proc-macro2",
896
+ "quote",
897
+ "syn",
898
+ ]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "tokenizers"
3
- version = "0.5.0"
3
+ version = "0.5.2"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -11,11 +11,11 @@ publish = false
11
11
  crate-type = ["cdylib"]
12
12
 
13
13
  [dependencies]
14
- magnus = "0.6"
14
+ magnus = "0.7"
15
15
  onig = { version = "6", default-features = false }
16
16
  serde = { version = "1", features = ["rc", "derive"] }
17
17
 
18
18
  [dependencies.tokenizers]
19
- version = "=0.19.1" # also update in from_pretrained.rb
19
+ version = "=0.20.0" # also update in from_pretrained.rb
20
20
  default-features = false
21
21
  features = ["progressbar", "onig", "esaxx_fast"]
@@ -34,6 +34,12 @@ impl Decoder for RbDecoder {
34
34
  }
35
35
  }
36
36
 
37
+ impl RbDecoder {
38
+ pub fn decode(&self, tokens: Vec<String>) -> RbResult<String> {
39
+ self.decoder.decode(tokens).map_err(RbError::from)
40
+ }
41
+ }
42
+
37
43
  macro_rules! getter {
38
44
  ($self: ident, $variant: ident, $($name: tt)+) => {{
39
45
  let decoder = &$self.decoder;
@@ -358,6 +364,7 @@ unsafe impl TypedData for RbDecoder {
358
364
 
359
365
  pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
360
366
  let decoder = module.define_class("Decoder", ruby.class_object())?;
367
+ decoder.define_method("decode", method!(RbDecoder::decode, 1))?;
361
368
 
362
369
  let class = module.define_class("BPEDecoder", decoder)?;
363
370
  class.define_singleton_method("_new", function!(RbBPEDecoder::new, 1))?;
@@ -42,6 +42,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
42
42
 
43
43
  let class = module.define_class("Tokenizer", ruby.class_object())?;
44
44
  class.define_singleton_method("new", function!(RbTokenizer::from_model, 1))?;
45
+ class.define_singleton_method("from_str", function!(RbTokenizer::from_str, 1))?;
45
46
  class.define_singleton_method("from_file", function!(RbTokenizer::from_file, 1))?;
46
47
  class.define_method(
47
48
  "add_special_tokens",
@@ -54,12 +55,18 @@ fn init(ruby: &Ruby) -> RbResult<()> {
54
55
  class.define_method("_encode_batch", method!(RbTokenizer::encode_batch, 3))?;
55
56
  class.define_method("_decode", method!(RbTokenizer::decode, 2))?;
56
57
  class.define_method("_decode_batch", method!(RbTokenizer::decode_batch, 2))?;
58
+ class.define_method("model", method!(RbTokenizer::get_model, 0))?;
59
+ class.define_method("model=", method!(RbTokenizer::set_model,1))?;
60
+ class.define_method("decoder", method!(RbTokenizer::get_decoder, 0))?;
57
61
  class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
62
+ class.define_method("pre_tokenizer", method!(RbTokenizer::get_pre_tokenizer, 0))?;
58
63
  class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
64
+ class.define_method("post_processor", method!(RbTokenizer::get_post_processor, 0))?;
59
65
  class.define_method(
60
66
  "post_processor=",
61
67
  method!(RbTokenizer::set_post_processor, 1),
62
68
  )?;
69
+ class.define_method("normalizer", method!(RbTokenizer::get_normalizer, 0))?;
63
70
  class.define_method("normalizer=", method!(RbTokenizer::set_normalizer, 1))?;
64
71
  class.define_method("token_to_id", method!(RbTokenizer::token_to_id, 1))?;
65
72
  class.define_method("id_to_token", method!(RbTokenizer::id_to_token, 1))?;
@@ -222,8 +222,8 @@ pub struct RbSequence {}
222
222
  impl RbSequence {
223
223
  fn new(normalizers: RArray) -> RbResult<RbNormalizer> {
224
224
  let mut sequence = Vec::with_capacity(normalizers.len());
225
- for n in normalizers.each() {
226
- let normalizer: &RbNormalizer = TryConvert::try_convert(n?)?;
225
+ for n in normalizers.into_iter() {
226
+ let normalizer: &RbNormalizer = TryConvert::try_convert(n)?;
227
227
  match &normalizer.normalizer {
228
228
  RbNormalizerTypeWrapper::Sequence(inner) => sequence.extend(inner.iter().cloned()),
229
229
  RbNormalizerTypeWrapper::Single(inner) => sequence.push(inner.clone()),
@@ -258,8 +258,8 @@ pub struct RbSequence {}
258
258
  impl RbSequence {
259
259
  fn new(pre_tokenizers: RArray) -> RbResult<RbPreTokenizer> {
260
260
  let mut sequence = Vec::with_capacity(pre_tokenizers.len());
261
- for n in pre_tokenizers.each() {
262
- let pretokenizer: &RbPreTokenizer = TryConvert::try_convert(n?)?;
261
+ for n in pre_tokenizers.into_iter() {
262
+ let pretokenizer: &RbPreTokenizer = TryConvert::try_convert(n)?;
263
263
  match &pretokenizer.pretok {
264
264
  RbPreTokenizerTypeWrapper::Sequence(inner) => {
265
265
  sequence.extend(inner.iter().cloned())
@@ -1,9 +1,10 @@
1
1
  use std::cell::RefCell;
2
2
  use std::collections::HashMap;
3
3
  use std::path::PathBuf;
4
+ use std::str::FromStr;
4
5
 
5
6
  use magnus::prelude::*;
6
- use magnus::{exception, Error, RArray, RHash, Symbol, TryConvert, Value};
7
+ use magnus::{exception, Error, RArray, RHash, RString, Symbol, TryConvert, Value};
7
8
  use tk::tokenizer::{
8
9
  Model, PaddingDirection, PaddingParams, PaddingStrategy,
9
10
  TruncationDirection, TruncationParams, TruncationStrategy, TokenizerImpl
@@ -203,6 +204,14 @@ impl RbTokenizer {
203
204
  RbTokenizer::new(TokenizerImpl::new(model.clone()))
204
205
  }
205
206
 
207
+ pub fn from_str(json: RString) -> RbResult<Self> {
208
+ Tokenizer::from_str(unsafe { json.as_str()? })
209
+ .map(|v| RbTokenizer {
210
+ tokenizer: RefCell::new(v),
211
+ })
212
+ .map_err(RbError::from)
213
+ }
214
+
206
215
  pub fn from_file(path: PathBuf) -> RbResult<Self> {
207
216
  Tokenizer::from_file(path)
208
217
  .map(|v| RbTokenizer {
@@ -282,12 +291,12 @@ impl RbTokenizer {
282
291
  add_special_tokens: bool,
283
292
  ) -> RbResult<RArray> {
284
293
  let input: Vec<tk::EncodeInput> = input
285
- .each()
294
+ .into_iter()
286
295
  .map(|o| {
287
296
  let input: tk::EncodeInput = if is_pretokenized {
288
- PreTokenizedEncodeInput::try_convert(o?)?.into()
297
+ PreTokenizedEncodeInput::try_convert(o)?.into()
289
298
  } else {
290
- TextEncodeInput::try_convert(o?)?.into()
299
+ TextEncodeInput::try_convert(o)?.into()
291
300
  };
292
301
  Ok(input)
293
302
  })
@@ -319,26 +328,50 @@ impl RbTokenizer {
319
328
  .map_err(RbError::from)
320
329
  }
321
330
 
322
- pub fn set_decoder(&self, decoder: &RbDecoder) {
323
- self.tokenizer.borrow_mut().with_decoder(decoder.clone());
331
+ pub fn get_model(&self) -> RbModel {
332
+ self.tokenizer.borrow().get_model().clone()
333
+ }
334
+
335
+ pub fn set_model(&self, model: &RbModel) {
336
+ self.tokenizer.borrow_mut().with_model(model.clone());
337
+ }
338
+
339
+ pub fn get_decoder(&self) -> Option<RbDecoder> {
340
+ self.tokenizer.borrow().get_decoder().cloned()
341
+ }
342
+
343
+ pub fn set_decoder(&self, decoder: Option<&RbDecoder>) {
344
+ self.tokenizer.borrow_mut().with_decoder(decoder.cloned());
345
+ }
346
+
347
+ pub fn get_pre_tokenizer(&self) -> Option<RbPreTokenizer> {
348
+ self.tokenizer.borrow().get_pre_tokenizer().cloned()
324
349
  }
325
350
 
326
- pub fn set_pre_tokenizer(&self, pretok: &RbPreTokenizer) {
351
+ pub fn set_pre_tokenizer(&self, pretok: Option<&RbPreTokenizer>) {
327
352
  self.tokenizer
328
353
  .borrow_mut()
329
- .with_pre_tokenizer(pretok.clone());
354
+ .with_pre_tokenizer(pretok.cloned());
330
355
  }
331
356
 
332
- pub fn set_post_processor(&self, processor: &RbPostProcessor) {
357
+ pub fn get_post_processor(&self) -> Option<RbPostProcessor> {
358
+ self.tokenizer.borrow().get_post_processor().cloned()
359
+ }
360
+
361
+ pub fn set_post_processor(&self, processor: Option<&RbPostProcessor>) {
333
362
  self.tokenizer
334
363
  .borrow_mut()
335
- .with_post_processor(processor.clone());
364
+ .with_post_processor(processor.cloned());
365
+ }
366
+
367
+ pub fn get_normalizer(&self) -> Option<RbNormalizer> {
368
+ self.tokenizer.borrow().get_normalizer().cloned()
336
369
  }
337
370
 
338
- pub fn set_normalizer(&self, normalizer: &RbNormalizer) {
371
+ pub fn set_normalizer(&self, normalizer: Option<&RbNormalizer>) {
339
372
  self.tokenizer
340
373
  .borrow_mut()
341
- .with_normalizer(normalizer.clone());
374
+ .with_normalizer(normalizer.cloned());
342
375
  }
343
376
 
344
377
  pub fn token_to_id(&self, token: String) -> Option<u32> {
@@ -110,9 +110,9 @@ impl RbTrainer {
110
110
  BpeTrainer,
111
111
  special_tokens,
112
112
  special_tokens
113
- .each()
113
+ .into_iter()
114
114
  .map(|token| {
115
- if let Ok(content) = String::try_convert(token?) {
115
+ if let Ok(content) = String::try_convert(token) {
116
116
  Ok(RbAddedToken::from(content, Some(true)).get_token())
117
117
  } else {
118
118
  todo!()
@@ -197,9 +197,9 @@ impl RbTrainer {
197
197
  UnigramTrainer,
198
198
  special_tokens,
199
199
  special_tokens
200
- .each()
200
+ .into_iter()
201
201
  .map(|token| {
202
- if let Ok(content) = String::try_convert(token?) {
202
+ if let Ok(content) = String::try_convert(token) {
203
203
  Ok(RbAddedToken::from(content, Some(true)).get_token())
204
204
  } else {
205
205
  todo!()
@@ -268,9 +268,9 @@ impl RbTrainer {
268
268
  WordLevelTrainer,
269
269
  special_tokens,
270
270
  special_tokens
271
- .each()
271
+ .into_iter()
272
272
  .map(|token| {
273
- if let Ok(content) = String::try_convert(token?) {
273
+ if let Ok(content) = String::try_convert(token) {
274
274
  Ok(RbAddedToken::from(content, Some(true)).get_token())
275
275
  } else {
276
276
  todo!()
@@ -322,9 +322,9 @@ impl RbTrainer {
322
322
  WordPieceTrainer,
323
323
  @set_special_tokens,
324
324
  special_tokens
325
- .each()
325
+ .into_iter()
326
326
  .map(|token| {
327
- if let Ok(content) = String::try_convert(token?) {
327
+ if let Ok(content) = String::try_convert(token) {
328
328
  Ok(RbAddedToken::from(content, Some(true)).get_token())
329
329
  } else {
330
330
  todo!()
@@ -398,9 +398,9 @@ impl RbBpeTrainer {
398
398
  if !value.is_nil() {
399
399
  builder = builder.special_tokens(
400
400
  RArray::try_convert(value)?
401
- .each()
401
+ .into_iter()
402
402
  .map(|token| {
403
- if let Ok(content) = String::try_convert(token?) {
403
+ if let Ok(content) = String::try_convert(token) {
404
404
  Ok(RbAddedToken::from(content, Some(true)).get_token())
405
405
  } else {
406
406
  todo!()
@@ -466,9 +466,9 @@ impl RbUnigramTrainer {
466
466
  if !value.is_nil() {
467
467
  builder.special_tokens(
468
468
  RArray::try_convert(value)?
469
- .each()
469
+ .into_iter()
470
470
  .map(|token| {
471
- if let Ok(content) = String::try_convert(token?) {
471
+ if let Ok(content) = String::try_convert(token) {
472
472
  Ok(RbAddedToken::from(content, Some(true)).get_token())
473
473
  } else {
474
474
  todo!()
@@ -540,9 +540,9 @@ impl RbWordLevelTrainer {
540
540
  if !value.is_nil() {
541
541
  builder.special_tokens(
542
542
  RArray::try_convert(value)?
543
- .each()
543
+ .into_iter()
544
544
  .map(|token| {
545
- if let Ok(content) = String::try_convert(token?) {
545
+ if let Ok(content) = String::try_convert(token) {
546
546
  Ok(RbAddedToken::from(content, Some(true)).get_token())
547
547
  } else {
548
548
  todo!()
@@ -581,9 +581,9 @@ impl RbWordPieceTrainer {
581
581
  if !value.is_nil() {
582
582
  builder = builder.special_tokens(
583
583
  RArray::try_convert(value)?
584
- .each()
584
+ .into_iter()
585
585
  .map(|token| {
586
- if let Ok(content) = String::try_convert(token?) {
586
+ if let Ok(content) = String::try_convert(token) {
587
587
  Ok(RbAddedToken::from(content, Some(true)).get_token())
588
588
  } else {
589
589
  todo!()
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.19.1"
4
+ TOKENIZERS_VERSION = "0.20.0"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -67,7 +67,7 @@ module Tokenizers
67
67
  end
68
68
  end
69
69
 
70
- options[:content_length_proc] = -> (_) { puts "Downloading..." }
70
+ options[:content_length_proc] = ->(_) { puts "Downloading..." }
71
71
 
72
72
  # string options are headers
73
73
  tempfile = URI.parse(url).open(headers.merge(options))
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.5.0"
2
+ VERSION = "0.5.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-05-21 00:00:00.000000000 Z
11
+ date: 2024-08-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
100
100
  - !ruby/object:Gem::Version
101
101
  version: '0'
102
102
  requirements: []
103
- rubygems_version: 3.5.9
103
+ rubygems_version: 3.5.11
104
104
  signing_key:
105
105
  specification_version: 4
106
106
  summary: Fast state-of-the-art tokenizers for Ruby