tokenizers 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 11fdc53989bc2285096bc5eb4a971426e153375ada43bc8c51d2a191b42fe02d
4
- data.tar.gz: 92d4d8bef2c4013d5cf0d55bff30bc070ca6e59430f87b96f9147075216a1c1d
3
+ metadata.gz: e6e88ec5618e36e317434410c960603695806bb59dadb2252f2957d8dbf0525b
4
+ data.tar.gz: 33a04a4a5faada27e6e7246c16d836a4ff9f6793e89de3cfd4880e30c6c8ed0d
5
5
  SHA512:
6
- metadata.gz: 7c91c33078b6c5b23a6080908fa184e2222922f35bd6e1c439329525f4554f939cf98ca4dcd3017d858cdb6ec2a96f6671ce5dc425ca643d482b49b04af00f4c
7
- data.tar.gz: 74aad6c458792570ace93107aacccb8a9df7b61228dd062bd7bcaf015a53d4f6b4b1fcace09c69bc959bd9b04598d315344d4edc7e6abbe1603f3b92d29ab711
6
+ metadata.gz: 88e4f2ad57fd1d66cd5fcf0d8b7ff6b1ea902258296fb02d207a446032134189e3445a104658074e94f914331c94f46cfdd09eed7c745c0483cb3b32b09e6abf
7
+ data.tar.gz: e8a1721ecbd36874322477077331743b0d1ba2de6f90076e07ad5456c230f76625d7f28ed6e6026c11395c6bb27701a6b8c0feedf2050387d32d9b777baa51fe
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.3.3 (2023-04-09)
2
+
3
+ - Updated Tokenizers to 0.13.3
4
+ - Added `ByteFallback`, `Fuse`, `Replace`, and `Strip` decoders
5
+ - Added `Prepend` normalizer
6
+
1
7
  ## 0.3.2 (2023-03-06)
2
8
 
3
9
  - Added precompiled gem for Linux x86-64 MUSL
data/Cargo.lock CHANGED
@@ -71,9 +71,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
71
71
 
72
72
  [[package]]
73
73
  name = "clang-sys"
74
- version = "1.4.0"
74
+ version = "1.6.1"
75
75
  source = "registry+https://github.com/rust-lang/crates.io-index"
76
- checksum = "fa2e27ae6ab525c3d369ded447057bca5438d86dc3a68f6faafb8269ba82ebf3"
76
+ checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f"
77
77
  dependencies = [
78
78
  "glob",
79
79
  "libc",
@@ -95,9 +95,9 @@ dependencies = [
95
95
 
96
96
  [[package]]
97
97
  name = "crossbeam-channel"
98
- version = "0.5.6"
98
+ version = "0.5.8"
99
99
  source = "registry+https://github.com/rust-lang/crates.io-index"
100
- checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521"
100
+ checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200"
101
101
  dependencies = [
102
102
  "cfg-if",
103
103
  "crossbeam-utils",
@@ -105,9 +105,9 @@ dependencies = [
105
105
 
106
106
  [[package]]
107
107
  name = "crossbeam-deque"
108
- version = "0.8.2"
108
+ version = "0.8.3"
109
109
  source = "registry+https://github.com/rust-lang/crates.io-index"
110
- checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc"
110
+ checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
111
111
  dependencies = [
112
112
  "cfg-if",
113
113
  "crossbeam-epoch",
@@ -116,9 +116,9 @@ dependencies = [
116
116
 
117
117
  [[package]]
118
118
  name = "crossbeam-epoch"
119
- version = "0.9.13"
119
+ version = "0.9.14"
120
120
  source = "registry+https://github.com/rust-lang/crates.io-index"
121
- checksum = "01a9af1f4c2ef74bb8aa1f7e19706bc72d03598c8a570bb5de72243c7a9d9d5a"
121
+ checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
122
122
  dependencies = [
123
123
  "autocfg",
124
124
  "cfg-if",
@@ -129,18 +129,18 @@ dependencies = [
129
129
 
130
130
  [[package]]
131
131
  name = "crossbeam-utils"
132
- version = "0.8.14"
132
+ version = "0.8.15"
133
133
  source = "registry+https://github.com/rust-lang/crates.io-index"
134
- checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f"
134
+ checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b"
135
135
  dependencies = [
136
136
  "cfg-if",
137
137
  ]
138
138
 
139
139
  [[package]]
140
140
  name = "darling"
141
- version = "0.14.3"
141
+ version = "0.14.4"
142
142
  source = "registry+https://github.com/rust-lang/crates.io-index"
143
- checksum = "c0808e1bd8671fb44a113a14e13497557533369847788fa2ae912b6ebfce9fa8"
143
+ checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850"
144
144
  dependencies = [
145
145
  "darling_core",
146
146
  "darling_macro",
@@ -148,27 +148,27 @@ dependencies = [
148
148
 
149
149
  [[package]]
150
150
  name = "darling_core"
151
- version = "0.14.3"
151
+ version = "0.14.4"
152
152
  source = "registry+https://github.com/rust-lang/crates.io-index"
153
- checksum = "001d80444f28e193f30c2f293455da62dcf9a6b29918a4253152ae2b1de592cb"
153
+ checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0"
154
154
  dependencies = [
155
155
  "fnv",
156
156
  "ident_case",
157
157
  "proc-macro2",
158
158
  "quote",
159
159
  "strsim",
160
- "syn",
160
+ "syn 1.0.109",
161
161
  ]
162
162
 
163
163
  [[package]]
164
164
  name = "darling_macro"
165
- version = "0.14.3"
165
+ version = "0.14.4"
166
166
  source = "registry+https://github.com/rust-lang/crates.io-index"
167
- checksum = "b36230598a2d5de7ec1c6f51f72d8a99a9208daff41de2084d06e3fd3ea56685"
167
+ checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e"
168
168
  dependencies = [
169
169
  "darling_core",
170
170
  "quote",
171
- "syn",
171
+ "syn 1.0.109",
172
172
  ]
173
173
 
174
174
  [[package]]
@@ -189,7 +189,7 @@ dependencies = [
189
189
  "darling",
190
190
  "proc-macro2",
191
191
  "quote",
192
- "syn",
192
+ "syn 1.0.109",
193
193
  ]
194
194
 
195
195
  [[package]]
@@ -199,7 +199,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
199
199
  checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e"
200
200
  dependencies = [
201
201
  "derive_builder_core",
202
- "syn",
202
+ "syn 1.0.109",
203
203
  ]
204
204
 
205
205
  [[package]]
@@ -231,9 +231,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
231
231
 
232
232
  [[package]]
233
233
  name = "getrandom"
234
- version = "0.2.8"
234
+ version = "0.2.9"
235
235
  source = "registry+https://github.com/rust-lang/crates.io-index"
236
- checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31"
236
+ checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4"
237
237
  dependencies = [
238
238
  "cfg-if",
239
239
  "libc",
@@ -293,9 +293,9 @@ dependencies = [
293
293
 
294
294
  [[package]]
295
295
  name = "itoa"
296
- version = "1.0.5"
296
+ version = "1.0.6"
297
297
  source = "registry+https://github.com/rust-lang/crates.io-index"
298
- checksum = "fad582f4b9e86b6caa621cabeb0963332d92eea04729ab12892c2533951e6440"
298
+ checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
299
299
 
300
300
  [[package]]
301
301
  name = "lazy_static"
@@ -311,9 +311,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
311
311
 
312
312
  [[package]]
313
313
  name = "libc"
314
- version = "0.2.139"
314
+ version = "0.2.141"
315
315
  source = "registry+https://github.com/rust-lang/crates.io-index"
316
- checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
316
+ checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5"
317
317
 
318
318
  [[package]]
319
319
  name = "libloading"
@@ -352,9 +352,9 @@ checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d"
352
352
 
353
353
  [[package]]
354
354
  name = "magnus"
355
- version = "0.5.0"
355
+ version = "0.5.3"
356
356
  source = "registry+https://github.com/rust-lang/crates.io-index"
357
- checksum = "af37419a942477f606d227d0e6e92f3b68458bfc68fec3bc2629df6a2c1ccdf9"
357
+ checksum = "c8dc14463c2552e753ef562961f486ca76f17a857c121db40e9f3ade3f35ab81"
358
358
  dependencies = [
359
359
  "magnus-macros",
360
360
  "rb-sys",
@@ -363,13 +363,13 @@ dependencies = [
363
363
 
364
364
  [[package]]
365
365
  name = "magnus-macros"
366
- version = "0.4.0"
366
+ version = "0.4.1"
367
367
  source = "registry+https://github.com/rust-lang/crates.io-index"
368
- checksum = "85aa71c9891b2732ff1157e1860a1ee578459fd25811fd3d72cc6e32b3fbdfea"
368
+ checksum = "6cc17af1d45442c011aa579d727ec6cff8a69aea8a6bbad26736e7112d749bfb"
369
369
  dependencies = [
370
370
  "proc-macro2",
371
371
  "quote",
372
- "syn",
372
+ "syn 1.0.109",
373
373
  ]
374
374
 
375
375
  [[package]]
@@ -380,9 +380,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
380
380
 
381
381
  [[package]]
382
382
  name = "memoffset"
383
- version = "0.7.1"
383
+ version = "0.8.0"
384
384
  source = "registry+https://github.com/rust-lang/crates.io-index"
385
- checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4"
385
+ checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
386
386
  dependencies = [
387
387
  "autocfg",
388
388
  ]
@@ -393,6 +393,27 @@ version = "0.2.1"
393
393
  source = "registry+https://github.com/rust-lang/crates.io-index"
394
394
  checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
395
395
 
396
+ [[package]]
397
+ name = "monostate"
398
+ version = "0.1.6"
399
+ source = "registry+https://github.com/rust-lang/crates.io-index"
400
+ checksum = "0230b703f1ac35df1e24f6d0d2255472bcccaf657ecdfa4f1fcbcad1ad5bb98a"
401
+ dependencies = [
402
+ "monostate-impl",
403
+ "serde",
404
+ ]
405
+
406
+ [[package]]
407
+ name = "monostate-impl"
408
+ version = "0.1.6"
409
+ source = "registry+https://github.com/rust-lang/crates.io-index"
410
+ checksum = "8795add3e14028f11f8e848bd3294898a8294767b3776b6f733560d33bd2530b"
411
+ dependencies = [
412
+ "proc-macro2",
413
+ "quote",
414
+ "syn 2.0.13",
415
+ ]
416
+
396
417
  [[package]]
397
418
  name = "nom"
398
419
  version = "7.1.3"
@@ -421,9 +442,9 @@ checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a"
421
442
 
422
443
  [[package]]
423
444
  name = "once_cell"
424
- version = "1.17.0"
445
+ version = "1.17.1"
425
446
  source = "registry+https://github.com/rust-lang/crates.io-index"
426
- checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66"
447
+ checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
427
448
 
428
449
  [[package]]
429
450
  name = "onig"
@@ -449,9 +470,9 @@ dependencies = [
449
470
 
450
471
  [[package]]
451
472
  name = "paste"
452
- version = "1.0.11"
473
+ version = "1.0.12"
453
474
  source = "registry+https://github.com/rust-lang/crates.io-index"
454
- checksum = "d01a5bd0424d00070b0098dd17ebca6f961a959dead1dbcbbbc1d1cd8d3deeba"
475
+ checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79"
455
476
 
456
477
  [[package]]
457
478
  name = "peeking_take_while"
@@ -473,18 +494,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
473
494
 
474
495
  [[package]]
475
496
  name = "proc-macro2"
476
- version = "1.0.51"
497
+ version = "1.0.56"
477
498
  source = "registry+https://github.com/rust-lang/crates.io-index"
478
- checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6"
499
+ checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435"
479
500
  dependencies = [
480
501
  "unicode-ident",
481
502
  ]
482
503
 
483
504
  [[package]]
484
505
  name = "quote"
485
- version = "1.0.23"
506
+ version = "1.0.26"
486
507
  source = "registry+https://github.com/rust-lang/crates.io-index"
487
- checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
508
+ checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc"
488
509
  dependencies = [
489
510
  "proc-macro2",
490
511
  ]
@@ -521,9 +542,9 @@ dependencies = [
521
542
 
522
543
  [[package]]
523
544
  name = "rayon"
524
- version = "1.6.1"
545
+ version = "1.7.0"
525
546
  source = "registry+https://github.com/rust-lang/crates.io-index"
526
- checksum = "6db3a213adf02b3bcfd2d3846bb41cb22857d131789e01df434fb7e7bc0759b7"
547
+ checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b"
527
548
  dependencies = [
528
549
  "either",
529
550
  "rayon-core",
@@ -542,9 +563,9 @@ dependencies = [
542
563
 
543
564
  [[package]]
544
565
  name = "rayon-core"
545
- version = "1.10.2"
566
+ version = "1.11.0"
546
567
  source = "registry+https://github.com/rust-lang/crates.io-index"
547
- checksum = "356a0625f1954f730c0201cdab48611198dc6ce21f4acff55089b5a78e6e835b"
568
+ checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d"
548
569
  dependencies = [
549
570
  "crossbeam-channel",
550
571
  "crossbeam-deque",
@@ -554,25 +575,26 @@ dependencies = [
554
575
 
555
576
  [[package]]
556
577
  name = "rb-sys"
557
- version = "0.9.65"
578
+ version = "0.9.71"
558
579
  source = "registry+https://github.com/rust-lang/crates.io-index"
559
- checksum = "e8fe617bad8e88fd7e5d6f432e35f09e5f94144dfb8e8ee4adde82fb920dc59b"
580
+ checksum = "156bfedced1e236600bcaad538477097ff2ed5c6b474e411d15b791e1d24c0f1"
560
581
  dependencies = [
561
582
  "rb-sys-build",
562
583
  ]
563
584
 
564
585
  [[package]]
565
586
  name = "rb-sys-build"
566
- version = "0.9.65"
587
+ version = "0.9.71"
567
588
  source = "registry+https://github.com/rust-lang/crates.io-index"
568
- checksum = "007e63597f91c711cbb299e60fecbdb6f5ad4a066d6a20c81943893f1584c895"
589
+ checksum = "5cb2e4a32cbc290b543a74567072ad24b708aff7bb5dde5a68d5690379cd7938"
569
590
  dependencies = [
570
591
  "bindgen",
571
592
  "lazy_static",
593
+ "proc-macro2",
572
594
  "quote",
573
595
  "regex",
574
596
  "shell-words",
575
- "syn",
597
+ "syn 1.0.109",
576
598
  ]
577
599
 
578
600
  [[package]]
@@ -583,9 +605,9 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
583
605
 
584
606
  [[package]]
585
607
  name = "regex"
586
- version = "1.7.1"
608
+ version = "1.7.3"
587
609
  source = "registry+https://github.com/rust-lang/crates.io-index"
588
- checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733"
610
+ checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d"
589
611
  dependencies = [
590
612
  "aho-corasick",
591
613
  "memchr",
@@ -594,9 +616,9 @@ dependencies = [
594
616
 
595
617
  [[package]]
596
618
  name = "regex-syntax"
597
- version = "0.6.28"
619
+ version = "0.6.29"
598
620
  source = "registry+https://github.com/rust-lang/crates.io-index"
599
- checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
621
+ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
600
622
 
601
623
  [[package]]
602
624
  name = "rustc-hash"
@@ -606,9 +628,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
606
628
 
607
629
  [[package]]
608
630
  name = "ryu"
609
- version = "1.0.12"
631
+ version = "1.0.13"
610
632
  source = "registry+https://github.com/rust-lang/crates.io-index"
611
- checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde"
633
+ checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
612
634
 
613
635
  [[package]]
614
636
  name = "scopeguard"
@@ -618,29 +640,29 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
618
640
 
619
641
  [[package]]
620
642
  name = "serde"
621
- version = "1.0.152"
643
+ version = "1.0.159"
622
644
  source = "registry+https://github.com/rust-lang/crates.io-index"
623
- checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb"
645
+ checksum = "3c04e8343c3daeec41f58990b9d77068df31209f2af111e059e9fe9646693065"
624
646
  dependencies = [
625
647
  "serde_derive",
626
648
  ]
627
649
 
628
650
  [[package]]
629
651
  name = "serde_derive"
630
- version = "1.0.152"
652
+ version = "1.0.159"
631
653
  source = "registry+https://github.com/rust-lang/crates.io-index"
632
- checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e"
654
+ checksum = "4c614d17805b093df4b147b51339e7e44bf05ef59fba1e45d83500bcfb4d8585"
633
655
  dependencies = [
634
656
  "proc-macro2",
635
657
  "quote",
636
- "syn",
658
+ "syn 2.0.13",
637
659
  ]
638
660
 
639
661
  [[package]]
640
662
  name = "serde_json"
641
- version = "1.0.92"
663
+ version = "1.0.95"
642
664
  source = "registry+https://github.com/rust-lang/crates.io-index"
643
- checksum = "7434af0dc1cbd59268aa98b4c22c131c0584d2232f6fb166efb993e2832e896a"
665
+ checksum = "d721eca97ac802aa7777b701877c8004d950fc142651367300d21c1cc0194744"
644
666
  dependencies = [
645
667
  "itoa",
646
668
  "ryu",
@@ -685,9 +707,20 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
685
707
 
686
708
  [[package]]
687
709
  name = "syn"
688
- version = "1.0.107"
710
+ version = "1.0.109"
689
711
  source = "registry+https://github.com/rust-lang/crates.io-index"
690
- checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5"
712
+ checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
713
+ dependencies = [
714
+ "proc-macro2",
715
+ "quote",
716
+ "unicode-ident",
717
+ ]
718
+
719
+ [[package]]
720
+ name = "syn"
721
+ version = "2.0.13"
722
+ source = "registry+https://github.com/rust-lang/crates.io-index"
723
+ checksum = "4c9da457c5285ac1f936ebd076af6dac17a61cfe7826f2076b4d015cf47bc8ec"
691
724
  dependencies = [
692
725
  "proc-macro2",
693
726
  "quote",
@@ -696,38 +729,39 @@ dependencies = [
696
729
 
697
730
  [[package]]
698
731
  name = "thiserror"
699
- version = "1.0.38"
732
+ version = "1.0.40"
700
733
  source = "registry+https://github.com/rust-lang/crates.io-index"
701
- checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0"
734
+ checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
702
735
  dependencies = [
703
736
  "thiserror-impl",
704
737
  ]
705
738
 
706
739
  [[package]]
707
740
  name = "thiserror-impl"
708
- version = "1.0.38"
741
+ version = "1.0.40"
709
742
  source = "registry+https://github.com/rust-lang/crates.io-index"
710
- checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f"
743
+ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
711
744
  dependencies = [
712
745
  "proc-macro2",
713
746
  "quote",
714
- "syn",
747
+ "syn 2.0.13",
715
748
  ]
716
749
 
717
750
  [[package]]
718
751
  name = "tokenizers"
719
- version = "0.3.1"
752
+ version = "0.3.3"
720
753
  dependencies = [
721
754
  "magnus",
722
755
  "onig",
723
756
  "serde",
724
- "tokenizers 0.13.2",
757
+ "tokenizers 0.13.3",
725
758
  ]
726
759
 
727
760
  [[package]]
728
761
  name = "tokenizers"
729
- version = "0.13.2"
730
- source = "git+https://github.com/huggingface/tokenizers#fa66caf0abff16bae2213658ffa3e969c5445750"
762
+ version = "0.13.3"
763
+ source = "registry+https://github.com/rust-lang/crates.io-index"
764
+ checksum = "5cf49017523bf0bc01c9966f172c5f120bbb7b96cccd1708772dd42e767fb9f5"
731
765
  dependencies = [
732
766
  "aho-corasick",
733
767
  "derive_builder",
@@ -738,6 +772,7 @@ dependencies = [
738
772
  "lazy_static",
739
773
  "log",
740
774
  "macro_rules_attribute",
775
+ "monostate",
741
776
  "onig",
742
777
  "paste",
743
778
  "rand",
@@ -756,9 +791,9 @@ dependencies = [
756
791
 
757
792
  [[package]]
758
793
  name = "unicode-ident"
759
- version = "1.0.6"
794
+ version = "1.0.8"
760
795
  source = "registry+https://github.com/rust-lang/crates.io-index"
761
- checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
796
+ checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
762
797
 
763
798
  [[package]]
764
799
  name = "unicode-normalization-alignments"
@@ -832,42 +867,42 @@ dependencies = [
832
867
 
833
868
  [[package]]
834
869
  name = "windows_aarch64_gnullvm"
835
- version = "0.42.1"
870
+ version = "0.42.2"
836
871
  source = "registry+https://github.com/rust-lang/crates.io-index"
837
- checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608"
872
+ checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
838
873
 
839
874
  [[package]]
840
875
  name = "windows_aarch64_msvc"
841
- version = "0.42.1"
876
+ version = "0.42.2"
842
877
  source = "registry+https://github.com/rust-lang/crates.io-index"
843
- checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7"
878
+ checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
844
879
 
845
880
  [[package]]
846
881
  name = "windows_i686_gnu"
847
- version = "0.42.1"
882
+ version = "0.42.2"
848
883
  source = "registry+https://github.com/rust-lang/crates.io-index"
849
- checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640"
884
+ checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
850
885
 
851
886
  [[package]]
852
887
  name = "windows_i686_msvc"
853
- version = "0.42.1"
888
+ version = "0.42.2"
854
889
  source = "registry+https://github.com/rust-lang/crates.io-index"
855
- checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605"
890
+ checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
856
891
 
857
892
  [[package]]
858
893
  name = "windows_x86_64_gnu"
859
- version = "0.42.1"
894
+ version = "0.42.2"
860
895
  source = "registry+https://github.com/rust-lang/crates.io-index"
861
- checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45"
896
+ checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
862
897
 
863
898
  [[package]]
864
899
  name = "windows_x86_64_gnullvm"
865
- version = "0.42.1"
900
+ version = "0.42.2"
866
901
  source = "registry+https://github.com/rust-lang/crates.io-index"
867
- checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463"
902
+ checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
868
903
 
869
904
  [[package]]
870
905
  name = "windows_x86_64_msvc"
871
- version = "0.42.1"
906
+ version = "0.42.2"
872
907
  source = "registry+https://github.com/rust-lang/crates.io-index"
873
- checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd"
908
+ checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "tokenizers"
3
- version = "0.3.1"
3
+ version = "0.3.3"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -11,11 +11,10 @@ crate-type = ["cdylib"]
11
11
 
12
12
  [dependencies]
13
13
  magnus = "0.5"
14
- onig = { version = "6.0", default-features = false }
15
- serde = { version = "1.0", features = ["rc", "derive"] }
14
+ onig = { version = "6", default-features = false }
15
+ serde = { version = "1", features = ["rc", "derive"] }
16
16
 
17
17
  [dependencies.tokenizers]
18
- version = "0.13.2" # also update in from_pretrained.rb
19
- git = "https://github.com/huggingface/tokenizers"
18
+ version = "=0.13.3" # also update in from_pretrained.rb
20
19
  default-features = false
21
20
  features = ["progressbar", "onig", "esaxx_fast"]
@@ -7,14 +7,19 @@ use magnus::{
7
7
  };
8
8
  use serde::{Deserialize, Serialize};
9
9
  use tk::decoders::bpe::BPEDecoder;
10
+ use tk::decoders::byte_fallback::ByteFallback;
10
11
  use tk::decoders::byte_level::ByteLevel;
11
12
  use tk::decoders::ctc::CTC;
13
+ use tk::decoders::fuse::Fuse;
12
14
  use tk::decoders::metaspace::Metaspace;
15
+ use tk::decoders::strip::Strip;
13
16
  use tk::decoders::wordpiece::WordPiece;
14
17
  use tk::decoders::DecoderWrapper;
15
18
  use tk::Decoder;
19
+ use tk::normalizers::replace::Replace;
16
20
 
17
- use super::RbResult;
21
+ use super::utils::*;
22
+ use super::{RbError, RbResult};
18
23
 
19
24
  #[derive(DataTypeFunctions, Clone, Deserialize, Serialize)]
20
25
  pub struct RbDecoder {
@@ -89,6 +94,30 @@ impl RbDecoder {
89
94
  setter!(self, CTC, word_delimiter_token, word_delimiter_token);
90
95
  }
91
96
 
97
+ fn strip_content(&self) -> char {
98
+ getter!(self, Strip, content)
99
+ }
100
+
101
+ fn strip_set_content(&self, content: char) {
102
+ setter!(self, Strip, content, content)
103
+ }
104
+
105
+ fn strip_start(&self) -> usize {
106
+ getter!(self, Strip, start)
107
+ }
108
+
109
+ fn strip_set_start(&self, start: usize) {
110
+ setter!(self, Strip, start, start)
111
+ }
112
+
113
+ fn strip_stop(&self) -> usize {
114
+ getter!(self, Strip, stop)
115
+ }
116
+
117
+ fn strip_set_stop(&self, stop: usize) {
118
+ setter!(self, Strip, stop, stop)
119
+ }
120
+
92
121
  pub fn metaspace_replacement(&self) -> char {
93
122
  getter!(self, Metaspace, get_replacement().clone())
94
123
  }
@@ -130,6 +159,14 @@ impl RbBPEDecoder {
130
159
  }
131
160
  }
132
161
 
162
+ pub struct RbByteFallbackDecoder {}
163
+
164
+ impl RbByteFallbackDecoder {
165
+ pub fn new() -> RbDecoder {
166
+ ByteFallback::default().into()
167
+ }
168
+ }
169
+
133
170
  pub struct RbByteLevelDecoder {}
134
171
 
135
172
  impl RbByteLevelDecoder {
@@ -146,6 +183,14 @@ impl RbCTC {
146
183
  }
147
184
  }
148
185
 
186
+ pub struct RbFuse {}
187
+
188
+ impl RbFuse {
189
+ pub fn new() -> RbDecoder {
190
+ Fuse::default().into()
191
+ }
192
+ }
193
+
149
194
  pub struct RbMetaspaceDecoder {}
150
195
 
151
196
  impl RbMetaspaceDecoder {
@@ -154,6 +199,22 @@ impl RbMetaspaceDecoder {
154
199
  }
155
200
  }
156
201
 
202
+ pub struct RbReplaceDecoder {}
203
+
204
+ impl RbReplaceDecoder {
205
+ pub fn new(pattern: RbPattern, content: String) -> RbResult<RbDecoder> {
206
+ Replace::new(pattern, content).map(|v| v.into()).map_err(RbError::from)
207
+ }
208
+ }
209
+
210
+ pub struct RbStripDecoder {}
211
+
212
+ impl RbStripDecoder {
213
+ pub fn new(content: char, start: usize, stop: usize) -> RbDecoder {
214
+ Strip::new(content, start, stop).into()
215
+ }
216
+ }
217
+
157
218
  pub struct RbWordPieceDecoder {}
158
219
 
159
220
  impl RbWordPieceDecoder {
@@ -219,6 +280,11 @@ unsafe impl TypedData for RbDecoder {
219
280
  class.undef_alloc_func();
220
281
  class
221
282
  }),
283
+ DecoderWrapper::ByteFallback(_) => *memoize!(RClass: {
284
+ let class: RClass = crate::decoders().const_get("ByteFallback").unwrap();
285
+ class.undef_alloc_func();
286
+ class
287
+ }),
222
288
  DecoderWrapper::ByteLevel(_) => *memoize!(RClass: {
223
289
  let class: RClass = crate::decoders().const_get("ByteLevel").unwrap();
224
290
  class.undef_alloc_func();
@@ -229,11 +295,26 @@ unsafe impl TypedData for RbDecoder {
229
295
  class.undef_alloc_func();
230
296
  class
231
297
  }),
298
+ DecoderWrapper::Fuse(_) => *memoize!(RClass: {
299
+ let class: RClass = crate::decoders().const_get("Fuse").unwrap();
300
+ class.undef_alloc_func();
301
+ class
302
+ }),
232
303
  DecoderWrapper::Metaspace(_) => *memoize!(RClass: {
233
304
  let class: RClass = crate::decoders().const_get("Metaspace").unwrap();
234
305
  class.undef_alloc_func();
235
306
  class
236
307
  }),
308
+ DecoderWrapper::Replace(_) => *memoize!(RClass: {
309
+ let class: RClass = crate::decoders().const_get("Replace").unwrap();
310
+ class.undef_alloc_func();
311
+ class
312
+ }),
313
+ DecoderWrapper::Strip(_) => *memoize!(RClass: {
314
+ let class: RClass = crate::decoders().const_get("Strip").unwrap();
315
+ class.undef_alloc_func();
316
+ class
317
+ }),
237
318
  DecoderWrapper::WordPiece(_) => *memoize!(RClass: {
238
319
  let class: RClass = crate::decoders().const_get("WordPiece").unwrap();
239
320
  class.undef_alloc_func();
@@ -253,6 +334,9 @@ pub fn decoders(module: &RModule) -> RbResult<()> {
253
334
  class.define_method("suffix", method!(RbDecoder::bpe_suffix, 0))?;
254
335
  class.define_method("suffix=", method!(RbDecoder::bpe_set_suffix, 1))?;
255
336
 
337
+ let class = module.define_class("ByteFallback", decoder)?;
338
+ class.define_singleton_method("new", function!(RbByteFallbackDecoder::new, 0))?;
339
+
256
340
  let class = module.define_class("ByteLevel", decoder)?;
257
341
  class.define_singleton_method("new", function!(RbByteLevelDecoder::new, 0))?;
258
342
 
@@ -265,6 +349,9 @@ pub fn decoders(module: &RModule) -> RbResult<()> {
265
349
  class.define_method("word_delimiter_token", method!(RbDecoder::ctc_word_delimiter_token, 0))?;
266
350
  class.define_method("word_delimiter_token=", method!(RbDecoder::ctc_set_word_delimiter_token, 1))?;
267
351
 
352
+ let class = module.define_class("Fuse", decoder)?;
353
+ class.define_singleton_method("new", function!(RbFuse::new, 0))?;
354
+
268
355
  let class = module.define_class("Metaspace", decoder)?;
269
356
  class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 2))?;
270
357
  class.define_method("add_prefix_space", method!(RbDecoder::metaspace_add_prefix_space, 0))?;
@@ -272,6 +359,18 @@ pub fn decoders(module: &RModule) -> RbResult<()> {
272
359
  class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
273
360
  class.define_method("replacement=", method!(RbDecoder::metaspace_set_replacement, 1))?;
274
361
 
362
+ let class = module.define_class("Replace", decoder)?;
363
+ class.define_singleton_method("new", function!(RbReplaceDecoder::new, 2))?;
364
+
365
+ let class = module.define_class("Strip", decoder)?;
366
+ class.define_singleton_method("_new", function!(RbStripDecoder::new, 3))?;
367
+ class.define_method("content", method!(RbDecoder::strip_content, 0))?;
368
+ class.define_method("content=", method!(RbDecoder::strip_set_content, 1))?;
369
+ class.define_method("start", method!(RbDecoder::strip_start, 0))?;
370
+ class.define_method("start=", method!(RbDecoder::strip_set_start, 1))?;
371
+ class.define_method("stop", method!(RbDecoder::strip_stop, 0))?;
372
+ class.define_method("stop=", method!(RbDecoder::strip_set_stop, 1))?;
373
+
275
374
  let class = module.define_class("WordPiece", decoder)?;
276
375
  class.define_singleton_method("_new", function!(RbWordPieceDecoder::new, 2))?;
277
376
  class.define_method("cleanup", method!(RbDecoder::word_piece_cleanup, 0))?;
@@ -101,6 +101,11 @@ impl RbBPE {
101
101
  builder = builder.fuse_unk(value.try_convert()?);
102
102
  }
103
103
 
104
+ let value: Value = kwargs.delete(Symbol::new("byte_fallback"))?;
105
+ if !value.is_nil() {
106
+ builder = builder.byte_fallback(value.try_convert()?);
107
+ }
108
+
104
109
  if !kwargs.is_empty() {
105
110
  // TODO improve message
106
111
  return Err(Error::new(exception::arg_error(), "unknown keyword"));
@@ -169,6 +174,14 @@ impl RbModel {
169
174
  setter!(self, BPE, fuse_unk, fuse_unk);
170
175
  }
171
176
 
177
+ pub fn bpe_byte_fallback(&self) -> bool {
178
+ getter!(self, BPE, byte_fallback)
179
+ }
180
+
181
+ pub fn bpe_set_byte_fallback(&self, byte_fallback: bool) {
182
+ setter!(self, BPE, byte_fallback, byte_fallback);
183
+ }
184
+
172
185
  pub fn bpe_continuing_subword_prefix(&self) -> Option<String> {
173
186
  getter!(self, BPE, continuing_subword_prefix.clone())
174
187
  }
@@ -355,6 +368,8 @@ pub fn models(module: &RModule) -> RbResult<()> {
355
368
  class.define_method("end_of_word_suffix=", method!(RbModel::bpe_set_end_of_word_suffix, 1))?;
356
369
  class.define_method("fuse_unk", method!(RbModel::bpe_fuse_unk, 0))?;
357
370
  class.define_method("fuse_unk=", method!(RbModel::bpe_set_fuse_unk, 1))?;
371
+ class.define_method("byte_fallback", method!(RbModel::bpe_byte_fallback, 0))?;
372
+ class.define_method("byte_fallback=", method!(RbModel::bpe_set_byte_fallback, 1))?;
358
373
 
359
374
  let class = module.define_class("Unigram", model)?;
360
375
  class.define_singleton_method("_new", function!(RbUnigram::new, 2))?;
@@ -8,7 +8,7 @@ use magnus::{
8
8
  use serde::ser::SerializeStruct;
9
9
  use serde::{Deserialize, Serialize, Serializer};
10
10
  use tk::normalizers::{
11
- BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Replace, Strip, StripAccents,
11
+ BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Replace, Prepend, Strip, StripAccents,
12
12
  NFC, NFD, NFKC, NFKD,
13
13
  };
14
14
  use tk::{NormalizedString, Normalizer};
@@ -44,7 +44,7 @@ macro_rules! getter {
44
44
  ($self: ident, $variant: ident, $name: ident) => {{
45
45
  if let RbNormalizerTypeWrapper::Single(ref norm) = &$self.normalizer {
46
46
  let wrapper = norm.read().unwrap();
47
- if let RbNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = *wrapper {
47
+ if let RbNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (*wrapper).clone() {
48
48
  o.$name
49
49
  } else {
50
50
  unreachable!()
@@ -105,6 +105,14 @@ impl RbNormalizer {
105
105
  setter!(self, BertNormalizer, lowercase, lowercase)
106
106
  }
107
107
 
108
+ fn prepend_prepend(&self) -> String {
109
+ getter!(self, Prepend, prepend)
110
+ }
111
+
112
+ fn prepend_set_prepend(&self, prepend: String) {
113
+ setter!(self, Prepend, prepend, prepend)
114
+ }
115
+
108
116
  fn strip_left(&self) -> bool {
109
117
  getter!(self, StripNormalizer, strip_left)
110
118
  }
@@ -186,6 +194,14 @@ impl RbReplace {
186
194
  }
187
195
  }
188
196
 
197
+ pub struct RbPrepend {}
198
+
199
+ impl RbPrepend {
200
+ pub fn new(prepend: String) -> RbNormalizer {
201
+ Prepend::new(prepend).into()
202
+ }
203
+ }
204
+
189
205
  pub struct RbStrip {}
190
206
 
191
207
  impl RbStrip {
@@ -372,6 +388,11 @@ unsafe impl TypedData for RbNormalizer {
372
388
  class.undef_alloc_func();
373
389
  class
374
390
  }),
391
+ NormalizerWrapper::Prepend(_) => *memoize!(RClass: {
392
+ let class: RClass = crate::normalizers().const_get("Prepend").unwrap();
393
+ class.undef_alloc_func();
394
+ class
395
+ }),
375
396
  NormalizerWrapper::StripNormalizer(_) => *memoize!(RClass: {
376
397
  let class: RClass = crate::normalizers().const_get("Strip").unwrap();
377
398
  class.undef_alloc_func();
@@ -428,6 +449,11 @@ pub fn normalizers(module: &RModule) -> RbResult<()> {
428
449
  let class = module.define_class("Replace", normalizer)?;
429
450
  class.define_singleton_method("new", function!(RbReplace::new, 2))?;
430
451
 
452
+ let class = module.define_class("Prepend", normalizer)?;
453
+ class.define_singleton_method("_new", function!(RbPrepend::new, 1))?;
454
+ class.define_method("prepend", method!(RbNormalizer::prepend_prepend, 0))?;
455
+ class.define_method("prepend=", method!(RbNormalizer::prepend_set_prepend, 1))?;
456
+
431
457
  let class = module.define_class("Strip", normalizer)?;
432
458
  class.define_singleton_method("_new", function!(RbStrip::new, 2))?;
433
459
  class.define_method("left", method!(RbNormalizer::strip_left, 0))?;
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Decoders
3
+ class Strip
4
+ def self.new(content: " ", start: 0, stop: 0)
5
+ _new(content, start, stop)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.13.2"
4
+ TOKENIZERS_VERSION = "0.13.3"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Normalizers
3
+ class Prepend
4
+ def self.new(prepend: "▁")
5
+ _new(prepend)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.3.2"
2
+ VERSION = "0.3.3"
3
3
  end
data/lib/tokenizers.rb CHANGED
@@ -9,6 +9,7 @@ end
9
9
  require_relative "tokenizers/decoders/bpe_decoder"
10
10
  require_relative "tokenizers/decoders/ctc"
11
11
  require_relative "tokenizers/decoders/metaspace"
12
+ require_relative "tokenizers/decoders/strip"
12
13
  require_relative "tokenizers/decoders/word_piece"
13
14
 
14
15
  # models
@@ -19,6 +20,7 @@ require_relative "tokenizers/models/unigram"
19
20
 
20
21
  # normalizers
21
22
  require_relative "tokenizers/normalizers/bert_normalizer"
23
+ require_relative "tokenizers/normalizers/prepend"
22
24
  require_relative "tokenizers/normalizers/strip"
23
25
 
24
26
  # pre-tokenizers
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-03-07 00:00:00.000000000 Z
11
+ date: 2023-04-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -56,6 +56,7 @@ files:
56
56
  - lib/tokenizers/decoders/bpe_decoder.rb
57
57
  - lib/tokenizers/decoders/ctc.rb
58
58
  - lib/tokenizers/decoders/metaspace.rb
59
+ - lib/tokenizers/decoders/strip.rb
59
60
  - lib/tokenizers/decoders/word_piece.rb
60
61
  - lib/tokenizers/encoding.rb
61
62
  - lib/tokenizers/from_pretrained.rb
@@ -64,6 +65,7 @@ files:
64
65
  - lib/tokenizers/models/word_level.rb
65
66
  - lib/tokenizers/models/word_piece.rb
66
67
  - lib/tokenizers/normalizers/bert_normalizer.rb
68
+ - lib/tokenizers/normalizers/prepend.rb
67
69
  - lib/tokenizers/normalizers/strip.rb
68
70
  - lib/tokenizers/pre_tokenizers/byte_level.rb
69
71
  - lib/tokenizers/pre_tokenizers/digits.rb
@@ -98,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
98
100
  - !ruby/object:Gem::Version
99
101
  version: '0'
100
102
  requirements: []
101
- rubygems_version: 3.4.6
103
+ rubygems_version: 3.4.10
102
104
  signing_key:
103
105
  specification_version: 4
104
106
  summary: Fast state-of-the-art tokenizers for Ruby