tokenizers 0.4.1 → 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ae078880dfee0d026206156174a482b7e5345aea4784bb4a3e1298c499dd0e3d
4
- data.tar.gz: baedf2cd55c0b4332232924bc2439e8ab9f6ba6703794e376f7f34f5724717c2
3
+ metadata.gz: 31ba3313f98f5360a6e9b0434c674d113622cf0421c6e7610ba250a8a9c79402
4
+ data.tar.gz: ba7c1913bdafa2b58835ac3689a34c37206b1a567c634f81aedaea0ca21a20cf
5
5
  SHA512:
6
- metadata.gz: 6292155935e06d70b9ab862d2493154ec21f3cc1ec9a7188e00517f026a3d79460f84b08c9701b6eab2b758ab27ce2a5a4fb90c517ec7a1817f5de31a0b95324
7
- data.tar.gz: 99b04f81650ae8b12be1e82dc8989a37d9d90542cb461c7fadf0e618f8ac4592b614fa357a462d2e71cb8833e058678ff6e5e5d421b825c969559f5569c89cd5
6
+ metadata.gz: 7cac28260eea675e5cea80324fa755681e9a8a06ce38fc501c57da69056d36d0485576dac1237da21a7d23ca24dacc46b6b1a92d4c7b5f91644fda37b5550ada
7
+ data.tar.gz: 84c361eadb625a96234b454f91b7f9a847010e42927042978003e3219e1b28d1da8d7665bc9cda7aa820b6053e5c6be1458cdaee6fe1d4709df632fb744155b4
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
1
+ ## 0.4.3 (2024-01-03)
2
+
3
+ - Added support for Ruby 3.3
4
+
5
+ ## 0.4.2 (2023-11-16)
6
+
7
+ - Updated Tokenizers to 0.15.0
8
+ - Fixed issue with download caching
9
+
1
10
  ## 0.4.1 (2023-10-05)
2
11
 
3
12
  - Fixed error loading gem
data/Cargo.lock CHANGED
@@ -4,18 +4,9 @@ version = 3
4
4
 
5
5
  [[package]]
6
6
  name = "aho-corasick"
7
- version = "0.7.20"
7
+ version = "1.1.1"
8
8
  source = "registry+https://github.com/rust-lang/crates.io-index"
9
- checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac"
10
- dependencies = [
11
- "memchr",
12
- ]
13
-
14
- [[package]]
15
- name = "aho-corasick"
16
- version = "1.0.5"
17
- source = "registry+https://github.com/rust-lang/crates.io-index"
18
- checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783"
9
+ checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab"
19
10
  dependencies = [
20
11
  "memchr",
21
12
  ]
@@ -34,11 +25,11 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
34
25
 
35
26
  [[package]]
36
27
  name = "bindgen"
37
- version = "0.62.0"
28
+ version = "0.69.1"
38
29
  source = "registry+https://github.com/rust-lang/crates.io-index"
39
- checksum = "c6720a8b7b2d39dd533285ed438d458f65b31b5c257e6ac7bb3d7e82844dd722"
30
+ checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2"
40
31
  dependencies = [
41
- "bitflags",
32
+ "bitflags 2.4.1",
42
33
  "cexpr",
43
34
  "clang-sys",
44
35
  "lazy_static",
@@ -49,7 +40,7 @@ dependencies = [
49
40
  "regex",
50
41
  "rustc-hash",
51
42
  "shlex",
52
- "syn 1.0.109",
43
+ "syn 2.0.38",
53
44
  ]
54
45
 
55
46
  [[package]]
@@ -58,6 +49,12 @@ version = "1.3.2"
58
49
  source = "registry+https://github.com/rust-lang/crates.io-index"
59
50
  checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
60
51
 
52
+ [[package]]
53
+ name = "bitflags"
54
+ version = "2.4.1"
55
+ source = "registry+https://github.com/rust-lang/crates.io-index"
56
+ checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
57
+
61
58
  [[package]]
62
59
  name = "cc"
63
60
  version = "1.0.79"
@@ -103,16 +100,6 @@ dependencies = [
103
100
  "windows-sys",
104
101
  ]
105
102
 
106
- [[package]]
107
- name = "crossbeam-channel"
108
- version = "0.5.8"
109
- source = "registry+https://github.com/rust-lang/crates.io-index"
110
- checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200"
111
- dependencies = [
112
- "cfg-if",
113
- "crossbeam-utils",
114
- ]
115
-
116
103
  [[package]]
117
104
  name = "crossbeam-deque"
118
105
  version = "0.8.3"
@@ -226,9 +213,9 @@ checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"
226
213
 
227
214
  [[package]]
228
215
  name = "esaxx-rs"
229
- version = "0.1.8"
216
+ version = "0.1.10"
230
217
  source = "registry+https://github.com/rust-lang/crates.io-index"
231
- checksum = "1f748b253ceca9fed5f42f8b5ceb3851e93102199bc25b64b65369f76e5c0a35"
218
+ checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6"
232
219
  dependencies = [
233
220
  "cc",
234
221
  ]
@@ -241,9 +228,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
241
228
 
242
229
  [[package]]
243
230
  name = "getrandom"
244
- version = "0.2.9"
231
+ version = "0.2.10"
245
232
  source = "registry+https://github.com/rust-lang/crates.io-index"
246
- checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4"
233
+ checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427"
247
234
  dependencies = [
248
235
  "cfg-if",
249
236
  "libc",
@@ -256,15 +243,6 @@ version = "0.3.1"
256
243
  source = "registry+https://github.com/rust-lang/crates.io-index"
257
244
  checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
258
245
 
259
- [[package]]
260
- name = "hermit-abi"
261
- version = "0.2.6"
262
- source = "registry+https://github.com/rust-lang/crates.io-index"
263
- checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7"
264
- dependencies = [
265
- "libc",
266
- ]
267
-
268
246
  [[package]]
269
247
  name = "ident_case"
270
248
  version = "1.0.1"
@@ -273,30 +251,31 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
273
251
 
274
252
  [[package]]
275
253
  name = "indicatif"
276
- version = "0.15.0"
254
+ version = "0.17.7"
277
255
  source = "registry+https://github.com/rust-lang/crates.io-index"
278
- checksum = "7baab56125e25686df467fe470785512329883aab42696d661247aca2a2896e4"
256
+ checksum = "fb28741c9db9a713d93deb3bb9515c20788cef5815265bee4980e87bde7e0f25"
279
257
  dependencies = [
280
258
  "console",
281
- "lazy_static",
259
+ "instant",
282
260
  "number_prefix",
283
- "regex",
261
+ "portable-atomic",
262
+ "unicode-width",
284
263
  ]
285
264
 
286
265
  [[package]]
287
- name = "itertools"
288
- version = "0.8.2"
266
+ name = "instant"
267
+ version = "0.1.12"
289
268
  source = "registry+https://github.com/rust-lang/crates.io-index"
290
- checksum = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484"
269
+ checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
291
270
  dependencies = [
292
- "either",
271
+ "cfg-if",
293
272
  ]
294
273
 
295
274
  [[package]]
296
275
  name = "itertools"
297
- version = "0.9.0"
276
+ version = "0.11.0"
298
277
  source = "registry+https://github.com/rust-lang/crates.io-index"
299
- checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b"
278
+ checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57"
300
279
  dependencies = [
301
280
  "either",
302
281
  ]
@@ -321,9 +300,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
321
300
 
322
301
  [[package]]
323
302
  name = "libc"
324
- version = "0.2.141"
303
+ version = "0.2.149"
325
304
  source = "registry+https://github.com/rust-lang/crates.io-index"
326
- checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5"
305
+ checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
327
306
 
328
307
  [[package]]
329
308
  name = "libloading"
@@ -346,9 +325,9 @@ dependencies = [
346
325
 
347
326
  [[package]]
348
327
  name = "macro_rules_attribute"
349
- version = "0.1.3"
328
+ version = "0.2.0"
350
329
  source = "registry+https://github.com/rust-lang/crates.io-index"
351
- checksum = "cf0c9b980bf4f3a37fd7b1c066941dd1b1d0152ce6ee6e8fe8c49b9f6810d862"
330
+ checksum = "8a82271f7bc033d84bbca59a3ce3e4159938cb08a9c3aebbe54d215131518a13"
352
331
  dependencies = [
353
332
  "macro_rules_attribute-proc_macro",
354
333
  "paste",
@@ -356,15 +335,15 @@ dependencies = [
356
335
 
357
336
  [[package]]
358
337
  name = "macro_rules_attribute-proc_macro"
359
- version = "0.1.3"
338
+ version = "0.2.0"
360
339
  source = "registry+https://github.com/rust-lang/crates.io-index"
361
- checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d"
340
+ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
362
341
 
363
342
  [[package]]
364
343
  name = "magnus"
365
- version = "0.6.0"
344
+ version = "0.6.2"
366
345
  source = "registry+https://github.com/rust-lang/crates.io-index"
367
- checksum = "68e9585bfe236e88e6b10b6d8eb5349bd0e0009f3f9dff8d2e99a82601b33743"
346
+ checksum = "4778544796676e8428e9c622460ebf284bea52d8b10db3aeb449d8b5e61b3a13"
368
347
  dependencies = [
369
348
  "magnus-macros",
370
349
  "rb-sys",
@@ -380,7 +359,7 @@ checksum = "5968c820e2960565f647819f5928a42d6e874551cab9d88d75e3e0660d7f71e3"
380
359
  dependencies = [
381
360
  "proc-macro2",
382
361
  "quote",
383
- "syn 2.0.13",
362
+ "syn 2.0.38",
384
363
  ]
385
364
 
386
365
  [[package]]
@@ -406,9 +385,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
406
385
 
407
386
  [[package]]
408
387
  name = "monostate"
409
- version = "0.1.6"
388
+ version = "0.1.9"
410
389
  source = "registry+https://github.com/rust-lang/crates.io-index"
411
- checksum = "0230b703f1ac35df1e24f6d0d2255472bcccaf657ecdfa4f1fcbcad1ad5bb98a"
390
+ checksum = "15f370ae88093ec6b11a710dec51321a61d420fafd1bad6e30d01bd9c920e8ee"
412
391
  dependencies = [
413
392
  "monostate-impl",
414
393
  "serde",
@@ -416,13 +395,13 @@ dependencies = [
416
395
 
417
396
  [[package]]
418
397
  name = "monostate-impl"
419
- version = "0.1.6"
398
+ version = "0.1.9"
420
399
  source = "registry+https://github.com/rust-lang/crates.io-index"
421
- checksum = "8795add3e14028f11f8e848bd3294898a8294767b3776b6f733560d33bd2530b"
400
+ checksum = "371717c0a5543d6a800cac822eac735aa7d2d2fbb41002e9856a4089532dbdce"
422
401
  dependencies = [
423
402
  "proc-macro2",
424
403
  "quote",
425
- "syn 2.0.13",
404
+ "syn 2.0.38",
426
405
  ]
427
406
 
428
407
  [[package]]
@@ -435,21 +414,11 @@ dependencies = [
435
414
  "minimal-lexical",
436
415
  ]
437
416
 
438
- [[package]]
439
- name = "num_cpus"
440
- version = "1.15.0"
441
- source = "registry+https://github.com/rust-lang/crates.io-index"
442
- checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b"
443
- dependencies = [
444
- "hermit-abi",
445
- "libc",
446
- ]
447
-
448
417
  [[package]]
449
418
  name = "number_prefix"
450
- version = "0.3.0"
419
+ version = "0.4.0"
451
420
  source = "registry+https://github.com/rust-lang/crates.io-index"
452
- checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a"
421
+ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
453
422
 
454
423
  [[package]]
455
424
  name = "once_cell"
@@ -463,7 +432,7 @@ version = "6.4.0"
463
432
  source = "registry+https://github.com/rust-lang/crates.io-index"
464
433
  checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f"
465
434
  dependencies = [
466
- "bitflags",
435
+ "bitflags 1.3.2",
467
436
  "libc",
468
437
  "once_cell",
469
438
  "onig_sys",
@@ -481,9 +450,9 @@ dependencies = [
481
450
 
482
451
  [[package]]
483
452
  name = "paste"
484
- version = "1.0.12"
453
+ version = "1.0.14"
485
454
  source = "registry+https://github.com/rust-lang/crates.io-index"
486
- checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79"
455
+ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
487
456
 
488
457
  [[package]]
489
458
  name = "peeking_take_while"
@@ -497,6 +466,12 @@ version = "0.3.26"
497
466
  source = "registry+https://github.com/rust-lang/crates.io-index"
498
467
  checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
499
468
 
469
+ [[package]]
470
+ name = "portable-atomic"
471
+ version = "1.4.3"
472
+ source = "registry+https://github.com/rust-lang/crates.io-index"
473
+ checksum = "31114a898e107c51bb1609ffaf55a0e011cf6a4d7f1170d0015a165082c0338b"
474
+
500
475
  [[package]]
501
476
  name = "ppv-lite86"
502
477
  version = "0.2.17"
@@ -505,18 +480,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
505
480
 
506
481
  [[package]]
507
482
  name = "proc-macro2"
508
- version = "1.0.56"
483
+ version = "1.0.68"
509
484
  source = "registry+https://github.com/rust-lang/crates.io-index"
510
- checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435"
485
+ checksum = "5b1106fec09662ec6dd98ccac0f81cef56984d0b49f75c92d8cbad76e20c005c"
511
486
  dependencies = [
512
487
  "unicode-ident",
513
488
  ]
514
489
 
515
490
  [[package]]
516
491
  name = "quote"
517
- version = "1.0.26"
492
+ version = "1.0.33"
518
493
  source = "registry+https://github.com/rust-lang/crates.io-index"
519
- checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc"
494
+ checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
520
495
  dependencies = [
521
496
  "proc-macro2",
522
497
  ]
@@ -553,9 +528,9 @@ dependencies = [
553
528
 
554
529
  [[package]]
555
530
  name = "rayon"
556
- version = "1.7.0"
531
+ version = "1.8.0"
557
532
  source = "registry+https://github.com/rust-lang/crates.io-index"
558
- checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b"
533
+ checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1"
559
534
  dependencies = [
560
535
  "either",
561
536
  "rayon-core",
@@ -563,41 +538,39 @@ dependencies = [
563
538
 
564
539
  [[package]]
565
540
  name = "rayon-cond"
566
- version = "0.1.0"
541
+ version = "0.3.0"
567
542
  source = "registry+https://github.com/rust-lang/crates.io-index"
568
- checksum = "fd1259362c9065e5ea39a789ef40b1e3fd934c94beb7b5ab3ac6629d3b5e7cb7"
543
+ checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9"
569
544
  dependencies = [
570
545
  "either",
571
- "itertools 0.8.2",
546
+ "itertools",
572
547
  "rayon",
573
548
  ]
574
549
 
575
550
  [[package]]
576
551
  name = "rayon-core"
577
- version = "1.11.0"
552
+ version = "1.12.0"
578
553
  source = "registry+https://github.com/rust-lang/crates.io-index"
579
- checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d"
554
+ checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed"
580
555
  dependencies = [
581
- "crossbeam-channel",
582
556
  "crossbeam-deque",
583
557
  "crossbeam-utils",
584
- "num_cpus",
585
558
  ]
586
559
 
587
560
  [[package]]
588
561
  name = "rb-sys"
589
- version = "0.9.79"
562
+ version = "0.9.86"
590
563
  source = "registry+https://github.com/rust-lang/crates.io-index"
591
- checksum = "939fb78db3e4f26665c1d4c7b91ca66d3578335a19aba552d4a6445811d07072"
564
+ checksum = "7285f2a7b92f58ab198e3fd59a71d2861478f9c4642f41e83582385818941697"
592
565
  dependencies = [
593
566
  "rb-sys-build",
594
567
  ]
595
568
 
596
569
  [[package]]
597
570
  name = "rb-sys-build"
598
- version = "0.9.79"
571
+ version = "0.9.86"
599
572
  source = "registry+https://github.com/rust-lang/crates.io-index"
600
- checksum = "335a95eb0420d52fa94ef12019df3c2c250c6b19cbb3c60bd05cb7e9c362072c"
573
+ checksum = "71583945f94dabb6c0dfa63f1b71e929c1901e1e288ef3739ab8bed3b7069550"
601
574
  dependencies = [
602
575
  "bindgen",
603
576
  "lazy_static",
@@ -605,7 +578,7 @@ dependencies = [
605
578
  "quote",
606
579
  "regex",
607
580
  "shell-words",
608
- "syn 1.0.109",
581
+ "syn 2.0.38",
609
582
  ]
610
583
 
611
584
  [[package]]
@@ -620,7 +593,7 @@ version = "1.9.5"
620
593
  source = "registry+https://github.com/rust-lang/crates.io-index"
621
594
  checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47"
622
595
  dependencies = [
623
- "aho-corasick 1.0.5",
596
+ "aho-corasick",
624
597
  "memchr",
625
598
  "regex-automata",
626
599
  "regex-syntax",
@@ -632,7 +605,7 @@ version = "0.3.8"
632
605
  source = "registry+https://github.com/rust-lang/crates.io-index"
633
606
  checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
634
607
  dependencies = [
635
- "aho-corasick 1.0.5",
608
+ "aho-corasick",
636
609
  "memchr",
637
610
  "regex-syntax",
638
611
  ]
@@ -669,22 +642,22 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
669
642
 
670
643
  [[package]]
671
644
  name = "serde"
672
- version = "1.0.159"
645
+ version = "1.0.188"
673
646
  source = "registry+https://github.com/rust-lang/crates.io-index"
674
- checksum = "3c04e8343c3daeec41f58990b9d77068df31209f2af111e059e9fe9646693065"
647
+ checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e"
675
648
  dependencies = [
676
649
  "serde_derive",
677
650
  ]
678
651
 
679
652
  [[package]]
680
653
  name = "serde_derive"
681
- version = "1.0.159"
654
+ version = "1.0.188"
682
655
  source = "registry+https://github.com/rust-lang/crates.io-index"
683
- checksum = "4c614d17805b093df4b147b51339e7e44bf05ef59fba1e45d83500bcfb4d8585"
656
+ checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2"
684
657
  dependencies = [
685
658
  "proc-macro2",
686
659
  "quote",
687
- "syn 2.0.13",
660
+ "syn 2.0.38",
688
661
  ]
689
662
 
690
663
  [[package]]
@@ -747,9 +720,9 @@ dependencies = [
747
720
 
748
721
  [[package]]
749
722
  name = "syn"
750
- version = "2.0.13"
723
+ version = "2.0.38"
751
724
  source = "registry+https://github.com/rust-lang/crates.io-index"
752
- checksum = "4c9da457c5285ac1f936ebd076af6dac17a61cfe7826f2076b4d015cf47bc8ec"
725
+ checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b"
753
726
  dependencies = [
754
727
  "proc-macro2",
755
728
  "quote",
@@ -758,46 +731,46 @@ dependencies = [
758
731
 
759
732
  [[package]]
760
733
  name = "thiserror"
761
- version = "1.0.40"
734
+ version = "1.0.49"
762
735
  source = "registry+https://github.com/rust-lang/crates.io-index"
763
- checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
736
+ checksum = "1177e8c6d7ede7afde3585fd2513e611227efd6481bd78d2e82ba1ce16557ed4"
764
737
  dependencies = [
765
738
  "thiserror-impl",
766
739
  ]
767
740
 
768
741
  [[package]]
769
742
  name = "thiserror-impl"
770
- version = "1.0.40"
743
+ version = "1.0.49"
771
744
  source = "registry+https://github.com/rust-lang/crates.io-index"
772
- checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
745
+ checksum = "10712f02019e9288794769fba95cd6847df9874d49d871d062172f9dd41bc4cc"
773
746
  dependencies = [
774
747
  "proc-macro2",
775
748
  "quote",
776
- "syn 2.0.13",
749
+ "syn 2.0.38",
777
750
  ]
778
751
 
779
752
  [[package]]
780
753
  name = "tokenizers"
781
- version = "0.4.1"
754
+ version = "0.4.3"
782
755
  dependencies = [
783
756
  "magnus",
784
757
  "onig",
785
758
  "serde",
786
- "tokenizers 0.14.0",
759
+ "tokenizers 0.15.0",
787
760
  ]
788
761
 
789
762
  [[package]]
790
763
  name = "tokenizers"
791
- version = "0.14.0"
764
+ version = "0.15.0"
792
765
  source = "registry+https://github.com/rust-lang/crates.io-index"
793
- checksum = "12b515a66453a4d68f03398054f7204fd0dde6b93d3f20ea90b08025ab49b499"
766
+ checksum = "062b8a9613d6017633b80fb55fbb33f1aff006c36225a3025630753398034b3c"
794
767
  dependencies = [
795
- "aho-corasick 0.7.20",
768
+ "aho-corasick",
796
769
  "derive_builder",
797
770
  "esaxx-rs",
798
771
  "getrandom",
799
772
  "indicatif",
800
- "itertools 0.9.0",
773
+ "itertools",
801
774
  "lazy_static",
802
775
  "log",
803
776
  "macro_rules_attribute",
data/Cargo.toml CHANGED
@@ -1,5 +1,6 @@
1
1
  [workspace]
2
2
  members = ["ext/tokenizers"]
3
+ resolver = "2"
3
4
 
4
5
  [profile.release]
5
6
  strip = true
data/README.md CHANGED
@@ -34,15 +34,51 @@ Decode
34
34
  tokenizer.decode(ids)
35
35
  ```
36
36
 
37
- Load a tokenizer from files
37
+ ## Training
38
+
39
+ Create a tokenizer
38
40
 
39
41
  ```ruby
40
- tokenizer = Tokenizers::CharBPETokenizer.new("vocab.json", "merges.txt")
42
+ tokenizer = Tokenizers::Tokenizer.new(Tokenizers::Models::BPE.new(unk_token: "[UNK]"))
41
43
  ```
42
44
 
43
- ## Training
45
+ Set the pre-tokenizer
46
+
47
+ ```ruby
48
+ tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::Whitespace.new
49
+ ```
50
+
51
+ Train the tokenizer ([example data](https://huggingface.co/docs/tokenizers/quicktour#build-a-tokenizer-from-scratch))
52
+
53
+ ```ruby
54
+ trainer = Tokenizers::Trainers::BpeTrainer.new(special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
55
+ tokenizer.train(["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"], trainer)
56
+ ```
57
+
58
+ Encode
59
+
60
+ ```ruby
61
+ output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
62
+ output.tokens
63
+ ```
64
+
65
+ Save the tokenizer to a file
66
+
67
+ ```ruby
68
+ tokenizer.save("tokenizer.json")
69
+ ```
70
+
71
+ Load a tokenizer from a file
72
+
73
+ ```ruby
74
+ tokenizer = Tokenizers.from_file("tokenizer.json")
75
+ ```
76
+
77
+ Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8) for more info
78
+
79
+ ## API
44
80
 
45
- Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8)
81
+ This library follows the [Tokenizers Python API](https://huggingface.co/docs/tokenizers/index). You can follow Python tutorials and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
46
82
 
47
83
  ## History
48
84
 
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "tokenizers"
3
- version = "0.4.1"
3
+ version = "0.4.3"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -16,6 +16,6 @@ onig = { version = "6", default-features = false }
16
16
  serde = { version = "1", features = ["rc", "derive"] }
17
17
 
18
18
  [dependencies.tokenizers]
19
- version = "=0.14.0" # also update in from_pretrained.rb
19
+ version = "=0.15.0" # also update in from_pretrained.rb
20
20
  default-features = false
21
21
  features = ["progressbar", "onig", "esaxx_fast"]
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.14.0"
4
+ TOKENIZERS_VERSION = "0.15.0"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -11,25 +11,27 @@ module Tokenizers
11
11
  require "digest"
12
12
  require "fileutils"
13
13
  require "json"
14
+ require "net/http"
14
15
  require "open-uri"
15
16
 
16
17
  cache_dir = ensure_cache_dir
17
18
 
18
- # string options are headers
19
19
  options = {
20
20
  open_timeout: 3,
21
- read_timeout: 30,
21
+ read_timeout: 30
22
+ }
23
+ headers = {
22
24
  "User-Agent" => "tokenizers/#{TOKENIZERS_VERSION}; bindings/Ruby; version/#{VERSION}"
23
25
  }
24
26
  if auth_token
25
- options["Authorization"] = "Bearer #{auth_token}"
27
+ headers["Authorization"] = "Bearer #{auth_token}"
26
28
  end
27
29
 
28
30
  url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }
29
31
 
30
32
  path =
31
33
  begin
32
- cached_path(cache_dir, url, options)
34
+ cached_path(cache_dir, url, headers, options)
33
35
  rescue OpenURI::HTTPError
34
36
  raise Error, "Model \"#{identifier}\" on the Hub doesn't have a tokenizer"
35
37
  end
@@ -41,7 +43,7 @@ module Tokenizers
41
43
 
42
44
  # use same storage format as Rust version
43
45
  # https://github.com/epwalsh/rust-cached-path
44
- def cached_path(cache_dir, url, options)
46
+ def cached_path(cache_dir, url, headers, options)
45
47
  fsum = Digest::SHA256.hexdigest(url)
46
48
  meta_paths = Dir[File.join(cache_dir, "#{fsum}.*.meta")]
47
49
  meta = meta_paths.map { |f| JSON.parse(File.read(f)) }.max_by { |m| m["creation_time"] }
@@ -50,21 +52,25 @@ module Tokenizers
50
52
  if etag
51
53
  esum = Digest::SHA256.hexdigest(etag)
52
54
  resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
53
- options["If-None-Match"] = etag if File.exist?(resource_path)
55
+ if File.exist?(resource_path)
56
+ uri = URI(url)
57
+ req = Net::HTTP::Head.new(uri)
58
+ headers.each do |k, v|
59
+ req[k] = v
60
+ end
61
+ res = Net::HTTP.start(uri.hostname, uri.port, options.merge(use_ssl: true)) do |http|
62
+ http.request(req)
63
+ end
64
+ if res["etag"] == etag
65
+ return resource_path
66
+ end
67
+ end
54
68
  end
55
69
 
56
70
  options[:content_length_proc] = -> (_) { puts "Downloading..." }
57
71
 
58
- tempfile =
59
- begin
60
- URI.parse(url).open(options)
61
- rescue OpenURI::HTTPError => e
62
- if e.message == "304 Not Modified"
63
- return resource_path
64
- else
65
- raise e
66
- end
67
- end
72
+ # string options are headers
73
+ tempfile = URI.parse(url).open(headers.merge(options))
68
74
 
69
75
  etag = tempfile.meta["etag"]
70
76
  esum = Digest::SHA256.hexdigest(etag)
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.4.1"
2
+ VERSION = "0.4.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-10-05 00:00:00.000000000 Z
11
+ date: 2024-01-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
100
100
  - !ruby/object:Gem::Version
101
101
  version: '0'
102
102
  requirements: []
103
- rubygems_version: 3.4.10
103
+ rubygems_version: 3.5.3
104
104
  signing_key:
105
105
  specification_version: 4
106
106
  summary: Fast state-of-the-art tokenizers for Ruby