tokenizers 0.3.2 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 11fdc53989bc2285096bc5eb4a971426e153375ada43bc8c51d2a191b42fe02d
4
- data.tar.gz: 92d4d8bef2c4013d5cf0d55bff30bc070ca6e59430f87b96f9147075216a1c1d
3
+ metadata.gz: e6e88ec5618e36e317434410c960603695806bb59dadb2252f2957d8dbf0525b
4
+ data.tar.gz: 33a04a4a5faada27e6e7246c16d836a4ff9f6793e89de3cfd4880e30c6c8ed0d
5
5
  SHA512:
6
- metadata.gz: 7c91c33078b6c5b23a6080908fa184e2222922f35bd6e1c439329525f4554f939cf98ca4dcd3017d858cdb6ec2a96f6671ce5dc425ca643d482b49b04af00f4c
7
- data.tar.gz: 74aad6c458792570ace93107aacccb8a9df7b61228dd062bd7bcaf015a53d4f6b4b1fcace09c69bc959bd9b04598d315344d4edc7e6abbe1603f3b92d29ab711
6
+ metadata.gz: 88e4f2ad57fd1d66cd5fcf0d8b7ff6b1ea902258296fb02d207a446032134189e3445a104658074e94f914331c94f46cfdd09eed7c745c0483cb3b32b09e6abf
7
+ data.tar.gz: e8a1721ecbd36874322477077331743b0d1ba2de6f90076e07ad5456c230f76625d7f28ed6e6026c11395c6bb27701a6b8c0feedf2050387d32d9b777baa51fe
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.3.3 (2023-04-09)
2
+
3
+ - Updated Tokenizers to 0.13.3
4
+ - Added `ByteFallback`, `Fuse`, `Replace`, and `Strip` decoders
5
+ - Added `Prepend` normalizer
6
+
1
7
  ## 0.3.2 (2023-03-06)
2
8
 
3
9
  - Added precompiled gem for Linux x86-64 MUSL
data/Cargo.lock CHANGED
@@ -71,9 +71,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
71
71
 
72
72
  [[package]]
73
73
  name = "clang-sys"
74
- version = "1.4.0"
74
+ version = "1.6.1"
75
75
  source = "registry+https://github.com/rust-lang/crates.io-index"
76
- checksum = "fa2e27ae6ab525c3d369ded447057bca5438d86dc3a68f6faafb8269ba82ebf3"
76
+ checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f"
77
77
  dependencies = [
78
78
  "glob",
79
79
  "libc",
@@ -95,9 +95,9 @@ dependencies = [
95
95
 
96
96
  [[package]]
97
97
  name = "crossbeam-channel"
98
- version = "0.5.6"
98
+ version = "0.5.8"
99
99
  source = "registry+https://github.com/rust-lang/crates.io-index"
100
- checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521"
100
+ checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200"
101
101
  dependencies = [
102
102
  "cfg-if",
103
103
  "crossbeam-utils",
@@ -105,9 +105,9 @@ dependencies = [
105
105
 
106
106
  [[package]]
107
107
  name = "crossbeam-deque"
108
- version = "0.8.2"
108
+ version = "0.8.3"
109
109
  source = "registry+https://github.com/rust-lang/crates.io-index"
110
- checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc"
110
+ checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
111
111
  dependencies = [
112
112
  "cfg-if",
113
113
  "crossbeam-epoch",
@@ -116,9 +116,9 @@ dependencies = [
116
116
 
117
117
  [[package]]
118
118
  name = "crossbeam-epoch"
119
- version = "0.9.13"
119
+ version = "0.9.14"
120
120
  source = "registry+https://github.com/rust-lang/crates.io-index"
121
- checksum = "01a9af1f4c2ef74bb8aa1f7e19706bc72d03598c8a570bb5de72243c7a9d9d5a"
121
+ checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
122
122
  dependencies = [
123
123
  "autocfg",
124
124
  "cfg-if",
@@ -129,18 +129,18 @@ dependencies = [
129
129
 
130
130
  [[package]]
131
131
  name = "crossbeam-utils"
132
- version = "0.8.14"
132
+ version = "0.8.15"
133
133
  source = "registry+https://github.com/rust-lang/crates.io-index"
134
- checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f"
134
+ checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b"
135
135
  dependencies = [
136
136
  "cfg-if",
137
137
  ]
138
138
 
139
139
  [[package]]
140
140
  name = "darling"
141
- version = "0.14.3"
141
+ version = "0.14.4"
142
142
  source = "registry+https://github.com/rust-lang/crates.io-index"
143
- checksum = "c0808e1bd8671fb44a113a14e13497557533369847788fa2ae912b6ebfce9fa8"
143
+ checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850"
144
144
  dependencies = [
145
145
  "darling_core",
146
146
  "darling_macro",
@@ -148,27 +148,27 @@ dependencies = [
148
148
 
149
149
  [[package]]
150
150
  name = "darling_core"
151
- version = "0.14.3"
151
+ version = "0.14.4"
152
152
  source = "registry+https://github.com/rust-lang/crates.io-index"
153
- checksum = "001d80444f28e193f30c2f293455da62dcf9a6b29918a4253152ae2b1de592cb"
153
+ checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0"
154
154
  dependencies = [
155
155
  "fnv",
156
156
  "ident_case",
157
157
  "proc-macro2",
158
158
  "quote",
159
159
  "strsim",
160
- "syn",
160
+ "syn 1.0.109",
161
161
  ]
162
162
 
163
163
  [[package]]
164
164
  name = "darling_macro"
165
- version = "0.14.3"
165
+ version = "0.14.4"
166
166
  source = "registry+https://github.com/rust-lang/crates.io-index"
167
- checksum = "b36230598a2d5de7ec1c6f51f72d8a99a9208daff41de2084d06e3fd3ea56685"
167
+ checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e"
168
168
  dependencies = [
169
169
  "darling_core",
170
170
  "quote",
171
- "syn",
171
+ "syn 1.0.109",
172
172
  ]
173
173
 
174
174
  [[package]]
@@ -189,7 +189,7 @@ dependencies = [
189
189
  "darling",
190
190
  "proc-macro2",
191
191
  "quote",
192
- "syn",
192
+ "syn 1.0.109",
193
193
  ]
194
194
 
195
195
  [[package]]
@@ -199,7 +199,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
199
199
  checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e"
200
200
  dependencies = [
201
201
  "derive_builder_core",
202
- "syn",
202
+ "syn 1.0.109",
203
203
  ]
204
204
 
205
205
  [[package]]
@@ -231,9 +231,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
231
231
 
232
232
  [[package]]
233
233
  name = "getrandom"
234
- version = "0.2.8"
234
+ version = "0.2.9"
235
235
  source = "registry+https://github.com/rust-lang/crates.io-index"
236
- checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31"
236
+ checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4"
237
237
  dependencies = [
238
238
  "cfg-if",
239
239
  "libc",
@@ -293,9 +293,9 @@ dependencies = [
293
293
 
294
294
  [[package]]
295
295
  name = "itoa"
296
- version = "1.0.5"
296
+ version = "1.0.6"
297
297
  source = "registry+https://github.com/rust-lang/crates.io-index"
298
- checksum = "fad582f4b9e86b6caa621cabeb0963332d92eea04729ab12892c2533951e6440"
298
+ checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
299
299
 
300
300
  [[package]]
301
301
  name = "lazy_static"
@@ -311,9 +311,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
311
311
 
312
312
  [[package]]
313
313
  name = "libc"
314
- version = "0.2.139"
314
+ version = "0.2.141"
315
315
  source = "registry+https://github.com/rust-lang/crates.io-index"
316
- checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
316
+ checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5"
317
317
 
318
318
  [[package]]
319
319
  name = "libloading"
@@ -352,9 +352,9 @@ checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d"
352
352
 
353
353
  [[package]]
354
354
  name = "magnus"
355
- version = "0.5.0"
355
+ version = "0.5.3"
356
356
  source = "registry+https://github.com/rust-lang/crates.io-index"
357
- checksum = "af37419a942477f606d227d0e6e92f3b68458bfc68fec3bc2629df6a2c1ccdf9"
357
+ checksum = "c8dc14463c2552e753ef562961f486ca76f17a857c121db40e9f3ade3f35ab81"
358
358
  dependencies = [
359
359
  "magnus-macros",
360
360
  "rb-sys",
@@ -363,13 +363,13 @@ dependencies = [
363
363
 
364
364
  [[package]]
365
365
  name = "magnus-macros"
366
- version = "0.4.0"
366
+ version = "0.4.1"
367
367
  source = "registry+https://github.com/rust-lang/crates.io-index"
368
- checksum = "85aa71c9891b2732ff1157e1860a1ee578459fd25811fd3d72cc6e32b3fbdfea"
368
+ checksum = "6cc17af1d45442c011aa579d727ec6cff8a69aea8a6bbad26736e7112d749bfb"
369
369
  dependencies = [
370
370
  "proc-macro2",
371
371
  "quote",
372
- "syn",
372
+ "syn 1.0.109",
373
373
  ]
374
374
 
375
375
  [[package]]
@@ -380,9 +380,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
380
380
 
381
381
  [[package]]
382
382
  name = "memoffset"
383
- version = "0.7.1"
383
+ version = "0.8.0"
384
384
  source = "registry+https://github.com/rust-lang/crates.io-index"
385
- checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4"
385
+ checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
386
386
  dependencies = [
387
387
  "autocfg",
388
388
  ]
@@ -393,6 +393,27 @@ version = "0.2.1"
393
393
  source = "registry+https://github.com/rust-lang/crates.io-index"
394
394
  checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
395
395
 
396
+ [[package]]
397
+ name = "monostate"
398
+ version = "0.1.6"
399
+ source = "registry+https://github.com/rust-lang/crates.io-index"
400
+ checksum = "0230b703f1ac35df1e24f6d0d2255472bcccaf657ecdfa4f1fcbcad1ad5bb98a"
401
+ dependencies = [
402
+ "monostate-impl",
403
+ "serde",
404
+ ]
405
+
406
+ [[package]]
407
+ name = "monostate-impl"
408
+ version = "0.1.6"
409
+ source = "registry+https://github.com/rust-lang/crates.io-index"
410
+ checksum = "8795add3e14028f11f8e848bd3294898a8294767b3776b6f733560d33bd2530b"
411
+ dependencies = [
412
+ "proc-macro2",
413
+ "quote",
414
+ "syn 2.0.13",
415
+ ]
416
+
396
417
  [[package]]
397
418
  name = "nom"
398
419
  version = "7.1.3"
@@ -421,9 +442,9 @@ checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a"
421
442
 
422
443
  [[package]]
423
444
  name = "once_cell"
424
- version = "1.17.0"
445
+ version = "1.17.1"
425
446
  source = "registry+https://github.com/rust-lang/crates.io-index"
426
- checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66"
447
+ checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
427
448
 
428
449
  [[package]]
429
450
  name = "onig"
@@ -449,9 +470,9 @@ dependencies = [
449
470
 
450
471
  [[package]]
451
472
  name = "paste"
452
- version = "1.0.11"
473
+ version = "1.0.12"
453
474
  source = "registry+https://github.com/rust-lang/crates.io-index"
454
- checksum = "d01a5bd0424d00070b0098dd17ebca6f961a959dead1dbcbbbc1d1cd8d3deeba"
475
+ checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79"
455
476
 
456
477
  [[package]]
457
478
  name = "peeking_take_while"
@@ -473,18 +494,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
473
494
 
474
495
  [[package]]
475
496
  name = "proc-macro2"
476
- version = "1.0.51"
497
+ version = "1.0.56"
477
498
  source = "registry+https://github.com/rust-lang/crates.io-index"
478
- checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6"
499
+ checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435"
479
500
  dependencies = [
480
501
  "unicode-ident",
481
502
  ]
482
503
 
483
504
  [[package]]
484
505
  name = "quote"
485
- version = "1.0.23"
506
+ version = "1.0.26"
486
507
  source = "registry+https://github.com/rust-lang/crates.io-index"
487
- checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
508
+ checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc"
488
509
  dependencies = [
489
510
  "proc-macro2",
490
511
  ]
@@ -521,9 +542,9 @@ dependencies = [
521
542
 
522
543
  [[package]]
523
544
  name = "rayon"
524
- version = "1.6.1"
545
+ version = "1.7.0"
525
546
  source = "registry+https://github.com/rust-lang/crates.io-index"
526
- checksum = "6db3a213adf02b3bcfd2d3846bb41cb22857d131789e01df434fb7e7bc0759b7"
547
+ checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b"
527
548
  dependencies = [
528
549
  "either",
529
550
  "rayon-core",
@@ -542,9 +563,9 @@ dependencies = [
542
563
 
543
564
  [[package]]
544
565
  name = "rayon-core"
545
- version = "1.10.2"
566
+ version = "1.11.0"
546
567
  source = "registry+https://github.com/rust-lang/crates.io-index"
547
- checksum = "356a0625f1954f730c0201cdab48611198dc6ce21f4acff55089b5a78e6e835b"
568
+ checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d"
548
569
  dependencies = [
549
570
  "crossbeam-channel",
550
571
  "crossbeam-deque",
@@ -554,25 +575,26 @@ dependencies = [
554
575
 
555
576
  [[package]]
556
577
  name = "rb-sys"
557
- version = "0.9.65"
578
+ version = "0.9.71"
558
579
  source = "registry+https://github.com/rust-lang/crates.io-index"
559
- checksum = "e8fe617bad8e88fd7e5d6f432e35f09e5f94144dfb8e8ee4adde82fb920dc59b"
580
+ checksum = "156bfedced1e236600bcaad538477097ff2ed5c6b474e411d15b791e1d24c0f1"
560
581
  dependencies = [
561
582
  "rb-sys-build",
562
583
  ]
563
584
 
564
585
  [[package]]
565
586
  name = "rb-sys-build"
566
- version = "0.9.65"
587
+ version = "0.9.71"
567
588
  source = "registry+https://github.com/rust-lang/crates.io-index"
568
- checksum = "007e63597f91c711cbb299e60fecbdb6f5ad4a066d6a20c81943893f1584c895"
589
+ checksum = "5cb2e4a32cbc290b543a74567072ad24b708aff7bb5dde5a68d5690379cd7938"
569
590
  dependencies = [
570
591
  "bindgen",
571
592
  "lazy_static",
593
+ "proc-macro2",
572
594
  "quote",
573
595
  "regex",
574
596
  "shell-words",
575
- "syn",
597
+ "syn 1.0.109",
576
598
  ]
577
599
 
578
600
  [[package]]
@@ -583,9 +605,9 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
583
605
 
584
606
  [[package]]
585
607
  name = "regex"
586
- version = "1.7.1"
608
+ version = "1.7.3"
587
609
  source = "registry+https://github.com/rust-lang/crates.io-index"
588
- checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733"
610
+ checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d"
589
611
  dependencies = [
590
612
  "aho-corasick",
591
613
  "memchr",
@@ -594,9 +616,9 @@ dependencies = [
594
616
 
595
617
  [[package]]
596
618
  name = "regex-syntax"
597
- version = "0.6.28"
619
+ version = "0.6.29"
598
620
  source = "registry+https://github.com/rust-lang/crates.io-index"
599
- checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
621
+ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
600
622
 
601
623
  [[package]]
602
624
  name = "rustc-hash"
@@ -606,9 +628,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
606
628
 
607
629
  [[package]]
608
630
  name = "ryu"
609
- version = "1.0.12"
631
+ version = "1.0.13"
610
632
  source = "registry+https://github.com/rust-lang/crates.io-index"
611
- checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde"
633
+ checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
612
634
 
613
635
  [[package]]
614
636
  name = "scopeguard"
@@ -618,29 +640,29 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
618
640
 
619
641
  [[package]]
620
642
  name = "serde"
621
- version = "1.0.152"
643
+ version = "1.0.159"
622
644
  source = "registry+https://github.com/rust-lang/crates.io-index"
623
- checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb"
645
+ checksum = "3c04e8343c3daeec41f58990b9d77068df31209f2af111e059e9fe9646693065"
624
646
  dependencies = [
625
647
  "serde_derive",
626
648
  ]
627
649
 
628
650
  [[package]]
629
651
  name = "serde_derive"
630
- version = "1.0.152"
652
+ version = "1.0.159"
631
653
  source = "registry+https://github.com/rust-lang/crates.io-index"
632
- checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e"
654
+ checksum = "4c614d17805b093df4b147b51339e7e44bf05ef59fba1e45d83500bcfb4d8585"
633
655
  dependencies = [
634
656
  "proc-macro2",
635
657
  "quote",
636
- "syn",
658
+ "syn 2.0.13",
637
659
  ]
638
660
 
639
661
  [[package]]
640
662
  name = "serde_json"
641
- version = "1.0.92"
663
+ version = "1.0.95"
642
664
  source = "registry+https://github.com/rust-lang/crates.io-index"
643
- checksum = "7434af0dc1cbd59268aa98b4c22c131c0584d2232f6fb166efb993e2832e896a"
665
+ checksum = "d721eca97ac802aa7777b701877c8004d950fc142651367300d21c1cc0194744"
644
666
  dependencies = [
645
667
  "itoa",
646
668
  "ryu",
@@ -685,9 +707,20 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
685
707
 
686
708
  [[package]]
687
709
  name = "syn"
688
- version = "1.0.107"
710
+ version = "1.0.109"
689
711
  source = "registry+https://github.com/rust-lang/crates.io-index"
690
- checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5"
712
+ checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
713
+ dependencies = [
714
+ "proc-macro2",
715
+ "quote",
716
+ "unicode-ident",
717
+ ]
718
+
719
+ [[package]]
720
+ name = "syn"
721
+ version = "2.0.13"
722
+ source = "registry+https://github.com/rust-lang/crates.io-index"
723
+ checksum = "4c9da457c5285ac1f936ebd076af6dac17a61cfe7826f2076b4d015cf47bc8ec"
691
724
  dependencies = [
692
725
  "proc-macro2",
693
726
  "quote",
@@ -696,38 +729,39 @@ dependencies = [
696
729
 
697
730
  [[package]]
698
731
  name = "thiserror"
699
- version = "1.0.38"
732
+ version = "1.0.40"
700
733
  source = "registry+https://github.com/rust-lang/crates.io-index"
701
- checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0"
734
+ checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
702
735
  dependencies = [
703
736
  "thiserror-impl",
704
737
  ]
705
738
 
706
739
  [[package]]
707
740
  name = "thiserror-impl"
708
- version = "1.0.38"
741
+ version = "1.0.40"
709
742
  source = "registry+https://github.com/rust-lang/crates.io-index"
710
- checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f"
743
+ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
711
744
  dependencies = [
712
745
  "proc-macro2",
713
746
  "quote",
714
- "syn",
747
+ "syn 2.0.13",
715
748
  ]
716
749
 
717
750
  [[package]]
718
751
  name = "tokenizers"
719
- version = "0.3.1"
752
+ version = "0.3.3"
720
753
  dependencies = [
721
754
  "magnus",
722
755
  "onig",
723
756
  "serde",
724
- "tokenizers 0.13.2",
757
+ "tokenizers 0.13.3",
725
758
  ]
726
759
 
727
760
  [[package]]
728
761
  name = "tokenizers"
729
- version = "0.13.2"
730
- source = "git+https://github.com/huggingface/tokenizers#fa66caf0abff16bae2213658ffa3e969c5445750"
762
+ version = "0.13.3"
763
+ source = "registry+https://github.com/rust-lang/crates.io-index"
764
+ checksum = "5cf49017523bf0bc01c9966f172c5f120bbb7b96cccd1708772dd42e767fb9f5"
731
765
  dependencies = [
732
766
  "aho-corasick",
733
767
  "derive_builder",
@@ -738,6 +772,7 @@ dependencies = [
738
772
  "lazy_static",
739
773
  "log",
740
774
  "macro_rules_attribute",
775
+ "monostate",
741
776
  "onig",
742
777
  "paste",
743
778
  "rand",
@@ -756,9 +791,9 @@ dependencies = [
756
791
 
757
792
  [[package]]
758
793
  name = "unicode-ident"
759
- version = "1.0.6"
794
+ version = "1.0.8"
760
795
  source = "registry+https://github.com/rust-lang/crates.io-index"
761
- checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
796
+ checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
762
797
 
763
798
  [[package]]
764
799
  name = "unicode-normalization-alignments"
@@ -832,42 +867,42 @@ dependencies = [
832
867
 
833
868
  [[package]]
834
869
  name = "windows_aarch64_gnullvm"
835
- version = "0.42.1"
870
+ version = "0.42.2"
836
871
  source = "registry+https://github.com/rust-lang/crates.io-index"
837
- checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608"
872
+ checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
838
873
 
839
874
  [[package]]
840
875
  name = "windows_aarch64_msvc"
841
- version = "0.42.1"
876
+ version = "0.42.2"
842
877
  source = "registry+https://github.com/rust-lang/crates.io-index"
843
- checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7"
878
+ checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
844
879
 
845
880
  [[package]]
846
881
  name = "windows_i686_gnu"
847
- version = "0.42.1"
882
+ version = "0.42.2"
848
883
  source = "registry+https://github.com/rust-lang/crates.io-index"
849
- checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640"
884
+ checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
850
885
 
851
886
  [[package]]
852
887
  name = "windows_i686_msvc"
853
- version = "0.42.1"
888
+ version = "0.42.2"
854
889
  source = "registry+https://github.com/rust-lang/crates.io-index"
855
- checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605"
890
+ checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
856
891
 
857
892
  [[package]]
858
893
  name = "windows_x86_64_gnu"
859
- version = "0.42.1"
894
+ version = "0.42.2"
860
895
  source = "registry+https://github.com/rust-lang/crates.io-index"
861
- checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45"
896
+ checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
862
897
 
863
898
  [[package]]
864
899
  name = "windows_x86_64_gnullvm"
865
- version = "0.42.1"
900
+ version = "0.42.2"
866
901
  source = "registry+https://github.com/rust-lang/crates.io-index"
867
- checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463"
902
+ checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
868
903
 
869
904
  [[package]]
870
905
  name = "windows_x86_64_msvc"
871
- version = "0.42.1"
906
+ version = "0.42.2"
872
907
  source = "registry+https://github.com/rust-lang/crates.io-index"
873
- checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd"
908
+ checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "tokenizers"
3
- version = "0.3.1"
3
+ version = "0.3.3"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -11,11 +11,10 @@ crate-type = ["cdylib"]
11
11
 
12
12
  [dependencies]
13
13
  magnus = "0.5"
14
- onig = { version = "6.0", default-features = false }
15
- serde = { version = "1.0", features = ["rc", "derive"] }
14
+ onig = { version = "6", default-features = false }
15
+ serde = { version = "1", features = ["rc", "derive"] }
16
16
 
17
17
  [dependencies.tokenizers]
18
- version = "0.13.2" # also update in from_pretrained.rb
19
- git = "https://github.com/huggingface/tokenizers"
18
+ version = "=0.13.3" # also update in from_pretrained.rb
20
19
  default-features = false
21
20
  features = ["progressbar", "onig", "esaxx_fast"]
@@ -7,14 +7,19 @@ use magnus::{
7
7
  };
8
8
  use serde::{Deserialize, Serialize};
9
9
  use tk::decoders::bpe::BPEDecoder;
10
+ use tk::decoders::byte_fallback::ByteFallback;
10
11
  use tk::decoders::byte_level::ByteLevel;
11
12
  use tk::decoders::ctc::CTC;
13
+ use tk::decoders::fuse::Fuse;
12
14
  use tk::decoders::metaspace::Metaspace;
15
+ use tk::decoders::strip::Strip;
13
16
  use tk::decoders::wordpiece::WordPiece;
14
17
  use tk::decoders::DecoderWrapper;
15
18
  use tk::Decoder;
19
+ use tk::normalizers::replace::Replace;
16
20
 
17
- use super::RbResult;
21
+ use super::utils::*;
22
+ use super::{RbError, RbResult};
18
23
 
19
24
  #[derive(DataTypeFunctions, Clone, Deserialize, Serialize)]
20
25
  pub struct RbDecoder {
@@ -89,6 +94,30 @@ impl RbDecoder {
89
94
  setter!(self, CTC, word_delimiter_token, word_delimiter_token);
90
95
  }
91
96
 
97
+ fn strip_content(&self) -> char {
98
+ getter!(self, Strip, content)
99
+ }
100
+
101
+ fn strip_set_content(&self, content: char) {
102
+ setter!(self, Strip, content, content)
103
+ }
104
+
105
+ fn strip_start(&self) -> usize {
106
+ getter!(self, Strip, start)
107
+ }
108
+
109
+ fn strip_set_start(&self, start: usize) {
110
+ setter!(self, Strip, start, start)
111
+ }
112
+
113
+ fn strip_stop(&self) -> usize {
114
+ getter!(self, Strip, stop)
115
+ }
116
+
117
+ fn strip_set_stop(&self, stop: usize) {
118
+ setter!(self, Strip, stop, stop)
119
+ }
120
+
92
121
  pub fn metaspace_replacement(&self) -> char {
93
122
  getter!(self, Metaspace, get_replacement().clone())
94
123
  }
@@ -130,6 +159,14 @@ impl RbBPEDecoder {
130
159
  }
131
160
  }
132
161
 
162
+ pub struct RbByteFallbackDecoder {}
163
+
164
+ impl RbByteFallbackDecoder {
165
+ pub fn new() -> RbDecoder {
166
+ ByteFallback::default().into()
167
+ }
168
+ }
169
+
133
170
  pub struct RbByteLevelDecoder {}
134
171
 
135
172
  impl RbByteLevelDecoder {
@@ -146,6 +183,14 @@ impl RbCTC {
146
183
  }
147
184
  }
148
185
 
186
+ pub struct RbFuse {}
187
+
188
+ impl RbFuse {
189
+ pub fn new() -> RbDecoder {
190
+ Fuse::default().into()
191
+ }
192
+ }
193
+
149
194
  pub struct RbMetaspaceDecoder {}
150
195
 
151
196
  impl RbMetaspaceDecoder {
@@ -154,6 +199,22 @@ impl RbMetaspaceDecoder {
154
199
  }
155
200
  }
156
201
 
202
+ pub struct RbReplaceDecoder {}
203
+
204
+ impl RbReplaceDecoder {
205
+ pub fn new(pattern: RbPattern, content: String) -> RbResult<RbDecoder> {
206
+ Replace::new(pattern, content).map(|v| v.into()).map_err(RbError::from)
207
+ }
208
+ }
209
+
210
+ pub struct RbStripDecoder {}
211
+
212
+ impl RbStripDecoder {
213
+ pub fn new(content: char, start: usize, stop: usize) -> RbDecoder {
214
+ Strip::new(content, start, stop).into()
215
+ }
216
+ }
217
+
157
218
  pub struct RbWordPieceDecoder {}
158
219
 
159
220
  impl RbWordPieceDecoder {
@@ -219,6 +280,11 @@ unsafe impl TypedData for RbDecoder {
219
280
  class.undef_alloc_func();
220
281
  class
221
282
  }),
283
+ DecoderWrapper::ByteFallback(_) => *memoize!(RClass: {
284
+ let class: RClass = crate::decoders().const_get("ByteFallback").unwrap();
285
+ class.undef_alloc_func();
286
+ class
287
+ }),
222
288
  DecoderWrapper::ByteLevel(_) => *memoize!(RClass: {
223
289
  let class: RClass = crate::decoders().const_get("ByteLevel").unwrap();
224
290
  class.undef_alloc_func();
@@ -229,11 +295,26 @@ unsafe impl TypedData for RbDecoder {
229
295
  class.undef_alloc_func();
230
296
  class
231
297
  }),
298
+ DecoderWrapper::Fuse(_) => *memoize!(RClass: {
299
+ let class: RClass = crate::decoders().const_get("Fuse").unwrap();
300
+ class.undef_alloc_func();
301
+ class
302
+ }),
232
303
  DecoderWrapper::Metaspace(_) => *memoize!(RClass: {
233
304
  let class: RClass = crate::decoders().const_get("Metaspace").unwrap();
234
305
  class.undef_alloc_func();
235
306
  class
236
307
  }),
308
+ DecoderWrapper::Replace(_) => *memoize!(RClass: {
309
+ let class: RClass = crate::decoders().const_get("Replace").unwrap();
310
+ class.undef_alloc_func();
311
+ class
312
+ }),
313
+ DecoderWrapper::Strip(_) => *memoize!(RClass: {
314
+ let class: RClass = crate::decoders().const_get("Strip").unwrap();
315
+ class.undef_alloc_func();
316
+ class
317
+ }),
237
318
  DecoderWrapper::WordPiece(_) => *memoize!(RClass: {
238
319
  let class: RClass = crate::decoders().const_get("WordPiece").unwrap();
239
320
  class.undef_alloc_func();
@@ -253,6 +334,9 @@ pub fn decoders(module: &RModule) -> RbResult<()> {
253
334
  class.define_method("suffix", method!(RbDecoder::bpe_suffix, 0))?;
254
335
  class.define_method("suffix=", method!(RbDecoder::bpe_set_suffix, 1))?;
255
336
 
337
+ let class = module.define_class("ByteFallback", decoder)?;
338
+ class.define_singleton_method("new", function!(RbByteFallbackDecoder::new, 0))?;
339
+
256
340
  let class = module.define_class("ByteLevel", decoder)?;
257
341
  class.define_singleton_method("new", function!(RbByteLevelDecoder::new, 0))?;
258
342
 
@@ -265,6 +349,9 @@ pub fn decoders(module: &RModule) -> RbResult<()> {
265
349
  class.define_method("word_delimiter_token", method!(RbDecoder::ctc_word_delimiter_token, 0))?;
266
350
  class.define_method("word_delimiter_token=", method!(RbDecoder::ctc_set_word_delimiter_token, 1))?;
267
351
 
352
+ let class = module.define_class("Fuse", decoder)?;
353
+ class.define_singleton_method("new", function!(RbFuse::new, 0))?;
354
+
268
355
  let class = module.define_class("Metaspace", decoder)?;
269
356
  class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 2))?;
270
357
  class.define_method("add_prefix_space", method!(RbDecoder::metaspace_add_prefix_space, 0))?;
@@ -272,6 +359,18 @@ pub fn decoders(module: &RModule) -> RbResult<()> {
272
359
  class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
273
360
  class.define_method("replacement=", method!(RbDecoder::metaspace_set_replacement, 1))?;
274
361
 
362
+ let class = module.define_class("Replace", decoder)?;
363
+ class.define_singleton_method("new", function!(RbReplaceDecoder::new, 2))?;
364
+
365
+ let class = module.define_class("Strip", decoder)?;
366
+ class.define_singleton_method("_new", function!(RbStripDecoder::new, 3))?;
367
+ class.define_method("content", method!(RbDecoder::strip_content, 0))?;
368
+ class.define_method("content=", method!(RbDecoder::strip_set_content, 1))?;
369
+ class.define_method("start", method!(RbDecoder::strip_start, 0))?;
370
+ class.define_method("start=", method!(RbDecoder::strip_set_start, 1))?;
371
+ class.define_method("stop", method!(RbDecoder::strip_stop, 0))?;
372
+ class.define_method("stop=", method!(RbDecoder::strip_set_stop, 1))?;
373
+
275
374
  let class = module.define_class("WordPiece", decoder)?;
276
375
  class.define_singleton_method("_new", function!(RbWordPieceDecoder::new, 2))?;
277
376
  class.define_method("cleanup", method!(RbDecoder::word_piece_cleanup, 0))?;
@@ -101,6 +101,11 @@ impl RbBPE {
101
101
  builder = builder.fuse_unk(value.try_convert()?);
102
102
  }
103
103
 
104
+ let value: Value = kwargs.delete(Symbol::new("byte_fallback"))?;
105
+ if !value.is_nil() {
106
+ builder = builder.byte_fallback(value.try_convert()?);
107
+ }
108
+
104
109
  if !kwargs.is_empty() {
105
110
  // TODO improve message
106
111
  return Err(Error::new(exception::arg_error(), "unknown keyword"));
@@ -169,6 +174,14 @@ impl RbModel {
169
174
  setter!(self, BPE, fuse_unk, fuse_unk);
170
175
  }
171
176
 
177
+ pub fn bpe_byte_fallback(&self) -> bool {
178
+ getter!(self, BPE, byte_fallback)
179
+ }
180
+
181
+ pub fn bpe_set_byte_fallback(&self, byte_fallback: bool) {
182
+ setter!(self, BPE, byte_fallback, byte_fallback);
183
+ }
184
+
172
185
  pub fn bpe_continuing_subword_prefix(&self) -> Option<String> {
173
186
  getter!(self, BPE, continuing_subword_prefix.clone())
174
187
  }
@@ -355,6 +368,8 @@ pub fn models(module: &RModule) -> RbResult<()> {
355
368
  class.define_method("end_of_word_suffix=", method!(RbModel::bpe_set_end_of_word_suffix, 1))?;
356
369
  class.define_method("fuse_unk", method!(RbModel::bpe_fuse_unk, 0))?;
357
370
  class.define_method("fuse_unk=", method!(RbModel::bpe_set_fuse_unk, 1))?;
371
+ class.define_method("byte_fallback", method!(RbModel::bpe_byte_fallback, 0))?;
372
+ class.define_method("byte_fallback=", method!(RbModel::bpe_set_byte_fallback, 1))?;
358
373
 
359
374
  let class = module.define_class("Unigram", model)?;
360
375
  class.define_singleton_method("_new", function!(RbUnigram::new, 2))?;
@@ -8,7 +8,7 @@ use magnus::{
8
8
  use serde::ser::SerializeStruct;
9
9
  use serde::{Deserialize, Serialize, Serializer};
10
10
  use tk::normalizers::{
11
- BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Replace, Strip, StripAccents,
11
+ BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Replace, Prepend, Strip, StripAccents,
12
12
  NFC, NFD, NFKC, NFKD,
13
13
  };
14
14
  use tk::{NormalizedString, Normalizer};
@@ -44,7 +44,7 @@ macro_rules! getter {
44
44
  ($self: ident, $variant: ident, $name: ident) => {{
45
45
  if let RbNormalizerTypeWrapper::Single(ref norm) = &$self.normalizer {
46
46
  let wrapper = norm.read().unwrap();
47
- if let RbNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = *wrapper {
47
+ if let RbNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (*wrapper).clone() {
48
48
  o.$name
49
49
  } else {
50
50
  unreachable!()
@@ -105,6 +105,14 @@ impl RbNormalizer {
105
105
  setter!(self, BertNormalizer, lowercase, lowercase)
106
106
  }
107
107
 
108
+ fn prepend_prepend(&self) -> String {
109
+ getter!(self, Prepend, prepend)
110
+ }
111
+
112
+ fn prepend_set_prepend(&self, prepend: String) {
113
+ setter!(self, Prepend, prepend, prepend)
114
+ }
115
+
108
116
  fn strip_left(&self) -> bool {
109
117
  getter!(self, StripNormalizer, strip_left)
110
118
  }
@@ -186,6 +194,14 @@ impl RbReplace {
186
194
  }
187
195
  }
188
196
 
197
+ pub struct RbPrepend {}
198
+
199
+ impl RbPrepend {
200
+ pub fn new(prepend: String) -> RbNormalizer {
201
+ Prepend::new(prepend).into()
202
+ }
203
+ }
204
+
189
205
  pub struct RbStrip {}
190
206
 
191
207
  impl RbStrip {
@@ -372,6 +388,11 @@ unsafe impl TypedData for RbNormalizer {
372
388
  class.undef_alloc_func();
373
389
  class
374
390
  }),
391
+ NormalizerWrapper::Prepend(_) => *memoize!(RClass: {
392
+ let class: RClass = crate::normalizers().const_get("Prepend").unwrap();
393
+ class.undef_alloc_func();
394
+ class
395
+ }),
375
396
  NormalizerWrapper::StripNormalizer(_) => *memoize!(RClass: {
376
397
  let class: RClass = crate::normalizers().const_get("Strip").unwrap();
377
398
  class.undef_alloc_func();
@@ -428,6 +449,11 @@ pub fn normalizers(module: &RModule) -> RbResult<()> {
428
449
  let class = module.define_class("Replace", normalizer)?;
429
450
  class.define_singleton_method("new", function!(RbReplace::new, 2))?;
430
451
 
452
+ let class = module.define_class("Prepend", normalizer)?;
453
+ class.define_singleton_method("_new", function!(RbPrepend::new, 1))?;
454
+ class.define_method("prepend", method!(RbNormalizer::prepend_prepend, 0))?;
455
+ class.define_method("prepend=", method!(RbNormalizer::prepend_set_prepend, 1))?;
456
+
431
457
  let class = module.define_class("Strip", normalizer)?;
432
458
  class.define_singleton_method("_new", function!(RbStrip::new, 2))?;
433
459
  class.define_method("left", method!(RbNormalizer::strip_left, 0))?;
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Decoders
3
+ class Strip
4
+ def self.new(content: " ", start: 0, stop: 0)
5
+ _new(content, start, stop)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.13.2"
4
+ TOKENIZERS_VERSION = "0.13.3"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Normalizers
3
+ class Prepend
4
+ def self.new(prepend: "▁")
5
+ _new(prepend)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.3.2"
2
+ VERSION = "0.3.3"
3
3
  end
data/lib/tokenizers.rb CHANGED
@@ -9,6 +9,7 @@ end
9
9
  require_relative "tokenizers/decoders/bpe_decoder"
10
10
  require_relative "tokenizers/decoders/ctc"
11
11
  require_relative "tokenizers/decoders/metaspace"
12
+ require_relative "tokenizers/decoders/strip"
12
13
  require_relative "tokenizers/decoders/word_piece"
13
14
 
14
15
  # models
@@ -19,6 +20,7 @@ require_relative "tokenizers/models/unigram"
19
20
 
20
21
  # normalizers
21
22
  require_relative "tokenizers/normalizers/bert_normalizer"
23
+ require_relative "tokenizers/normalizers/prepend"
22
24
  require_relative "tokenizers/normalizers/strip"
23
25
 
24
26
  # pre-tokenizers
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-03-07 00:00:00.000000000 Z
11
+ date: 2023-04-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -56,6 +56,7 @@ files:
56
56
  - lib/tokenizers/decoders/bpe_decoder.rb
57
57
  - lib/tokenizers/decoders/ctc.rb
58
58
  - lib/tokenizers/decoders/metaspace.rb
59
+ - lib/tokenizers/decoders/strip.rb
59
60
  - lib/tokenizers/decoders/word_piece.rb
60
61
  - lib/tokenizers/encoding.rb
61
62
  - lib/tokenizers/from_pretrained.rb
@@ -64,6 +65,7 @@ files:
64
65
  - lib/tokenizers/models/word_level.rb
65
66
  - lib/tokenizers/models/word_piece.rb
66
67
  - lib/tokenizers/normalizers/bert_normalizer.rb
68
+ - lib/tokenizers/normalizers/prepend.rb
67
69
  - lib/tokenizers/normalizers/strip.rb
68
70
  - lib/tokenizers/pre_tokenizers/byte_level.rb
69
71
  - lib/tokenizers/pre_tokenizers/digits.rb
@@ -98,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
98
100
  - !ruby/object:Gem::Version
99
101
  version: '0'
100
102
  requirements: []
101
- rubygems_version: 3.4.6
103
+ rubygems_version: 3.4.10
102
104
  signing_key:
103
105
  specification_version: 4
104
106
  summary: Fast state-of-the-art tokenizers for Ruby