tokenizers 0.3.1 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b298d2a912a3d7fe32b9dc60036822b39ff463ad9739fec9ca00809922c6f45c
4
- data.tar.gz: 8491e0c8532d9eefe5ae1c0940f57ae3986480fb9460897d7612e8937d97e384
3
+ metadata.gz: e6e88ec5618e36e317434410c960603695806bb59dadb2252f2957d8dbf0525b
4
+ data.tar.gz: 33a04a4a5faada27e6e7246c16d836a4ff9f6793e89de3cfd4880e30c6c8ed0d
5
5
  SHA512:
6
- metadata.gz: 3616d92731fe9c7e4214166392b05734b10dec0e004106c01c756cab544491523b596b5d0a44ef8a47708719fa32839b86a1038a31104fae6d3de3a2e17ccca9
7
- data.tar.gz: 88a375508473b97ae0b30dfb12a2c930a20477b003e5f2977a670904dab077ab4e2771b98dd3a446fc9b9c417e886f3b7355bd7eec527eb898251c9c60a0337c
6
+ metadata.gz: 88e4f2ad57fd1d66cd5fcf0d8b7ff6b1ea902258296fb02d207a446032134189e3445a104658074e94f914331c94f46cfdd09eed7c745c0483cb3b32b09e6abf
7
+ data.tar.gz: e8a1721ecbd36874322477077331743b0d1ba2de6f90076e07ad5456c230f76625d7f28ed6e6026c11395c6bb27701a6b8c0feedf2050387d32d9b777baa51fe
data/CHANGELOG.md CHANGED
@@ -1,8 +1,18 @@
1
- ## 0.3.1 (2022-02-08)
1
+ ## 0.3.3 (2023-04-09)
2
+
3
+ - Updated Tokenizers to 0.13.3
4
+ - Added `ByteFallback`, `Fuse`, `Replace`, and `Strip` decoders
5
+ - Added `Prepend` normalizer
6
+
7
+ ## 0.3.2 (2023-03-06)
8
+
9
+ - Added precompiled gem for Linux x86-64 MUSL
10
+
11
+ ## 0.3.1 (2023-02-08)
2
12
 
3
13
  - Fixed error with Ruby 2.7
4
14
 
5
- ## 0.3.0 (2022-02-07)
15
+ ## 0.3.0 (2023-02-07)
6
16
 
7
17
  - Added support for training tokenizers
8
18
  - Added more methods to `Tokenizer`
@@ -11,20 +21,20 @@
11
21
  - Changed `encode` method to include special tokens by default
12
22
  - Changed how offsets are calculated for strings with multibyte characters
13
23
 
14
- ## 0.2.3 (2022-01-22)
24
+ ## 0.2.3 (2023-01-22)
15
25
 
16
26
  - Added `add_special_tokens` option to `encode` method
17
27
  - Added warning about `encode` method including special tokens by default in 0.3.0
18
28
  - Added more methods to `Encoding`
19
29
  - Fixed error with precompiled gem on Mac ARM
20
30
 
21
- ## 0.2.2 (2022-01-15)
31
+ ## 0.2.2 (2023-01-15)
22
32
 
23
33
  - Added precompiled gem for Linux ARM
24
34
  - Added `from_file` method
25
35
  - Fixed error with precompiled gem on Linux x86-64
26
36
 
27
- ## 0.2.1 (2022-01-12)
37
+ ## 0.2.1 (2023-01-12)
28
38
 
29
39
  - Added support for Ruby 3.2
30
40
 
data/Cargo.lock CHANGED
@@ -71,9 +71,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
71
71
 
72
72
  [[package]]
73
73
  name = "clang-sys"
74
- version = "1.4.0"
74
+ version = "1.6.1"
75
75
  source = "registry+https://github.com/rust-lang/crates.io-index"
76
- checksum = "fa2e27ae6ab525c3d369ded447057bca5438d86dc3a68f6faafb8269ba82ebf3"
76
+ checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f"
77
77
  dependencies = [
78
78
  "glob",
79
79
  "libc",
@@ -95,9 +95,9 @@ dependencies = [
95
95
 
96
96
  [[package]]
97
97
  name = "crossbeam-channel"
98
- version = "0.5.6"
98
+ version = "0.5.8"
99
99
  source = "registry+https://github.com/rust-lang/crates.io-index"
100
- checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521"
100
+ checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200"
101
101
  dependencies = [
102
102
  "cfg-if",
103
103
  "crossbeam-utils",
@@ -105,9 +105,9 @@ dependencies = [
105
105
 
106
106
  [[package]]
107
107
  name = "crossbeam-deque"
108
- version = "0.8.2"
108
+ version = "0.8.3"
109
109
  source = "registry+https://github.com/rust-lang/crates.io-index"
110
- checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc"
110
+ checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
111
111
  dependencies = [
112
112
  "cfg-if",
113
113
  "crossbeam-epoch",
@@ -116,9 +116,9 @@ dependencies = [
116
116
 
117
117
  [[package]]
118
118
  name = "crossbeam-epoch"
119
- version = "0.9.13"
119
+ version = "0.9.14"
120
120
  source = "registry+https://github.com/rust-lang/crates.io-index"
121
- checksum = "01a9af1f4c2ef74bb8aa1f7e19706bc72d03598c8a570bb5de72243c7a9d9d5a"
121
+ checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
122
122
  dependencies = [
123
123
  "autocfg",
124
124
  "cfg-if",
@@ -129,18 +129,18 @@ dependencies = [
129
129
 
130
130
  [[package]]
131
131
  name = "crossbeam-utils"
132
- version = "0.8.14"
132
+ version = "0.8.15"
133
133
  source = "registry+https://github.com/rust-lang/crates.io-index"
134
- checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f"
134
+ checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b"
135
135
  dependencies = [
136
136
  "cfg-if",
137
137
  ]
138
138
 
139
139
  [[package]]
140
140
  name = "darling"
141
- version = "0.14.3"
141
+ version = "0.14.4"
142
142
  source = "registry+https://github.com/rust-lang/crates.io-index"
143
- checksum = "c0808e1bd8671fb44a113a14e13497557533369847788fa2ae912b6ebfce9fa8"
143
+ checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850"
144
144
  dependencies = [
145
145
  "darling_core",
146
146
  "darling_macro",
@@ -148,27 +148,27 @@ dependencies = [
148
148
 
149
149
  [[package]]
150
150
  name = "darling_core"
151
- version = "0.14.3"
151
+ version = "0.14.4"
152
152
  source = "registry+https://github.com/rust-lang/crates.io-index"
153
- checksum = "001d80444f28e193f30c2f293455da62dcf9a6b29918a4253152ae2b1de592cb"
153
+ checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0"
154
154
  dependencies = [
155
155
  "fnv",
156
156
  "ident_case",
157
157
  "proc-macro2",
158
158
  "quote",
159
159
  "strsim",
160
- "syn",
160
+ "syn 1.0.109",
161
161
  ]
162
162
 
163
163
  [[package]]
164
164
  name = "darling_macro"
165
- version = "0.14.3"
165
+ version = "0.14.4"
166
166
  source = "registry+https://github.com/rust-lang/crates.io-index"
167
- checksum = "b36230598a2d5de7ec1c6f51f72d8a99a9208daff41de2084d06e3fd3ea56685"
167
+ checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e"
168
168
  dependencies = [
169
169
  "darling_core",
170
170
  "quote",
171
- "syn",
171
+ "syn 1.0.109",
172
172
  ]
173
173
 
174
174
  [[package]]
@@ -189,7 +189,7 @@ dependencies = [
189
189
  "darling",
190
190
  "proc-macro2",
191
191
  "quote",
192
- "syn",
192
+ "syn 1.0.109",
193
193
  ]
194
194
 
195
195
  [[package]]
@@ -199,7 +199,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
199
199
  checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e"
200
200
  dependencies = [
201
201
  "derive_builder_core",
202
- "syn",
202
+ "syn 1.0.109",
203
203
  ]
204
204
 
205
205
  [[package]]
@@ -231,9 +231,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
231
231
 
232
232
  [[package]]
233
233
  name = "getrandom"
234
- version = "0.2.8"
234
+ version = "0.2.9"
235
235
  source = "registry+https://github.com/rust-lang/crates.io-index"
236
- checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31"
236
+ checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4"
237
237
  dependencies = [
238
238
  "cfg-if",
239
239
  "libc",
@@ -293,9 +293,9 @@ dependencies = [
293
293
 
294
294
  [[package]]
295
295
  name = "itoa"
296
- version = "1.0.5"
296
+ version = "1.0.6"
297
297
  source = "registry+https://github.com/rust-lang/crates.io-index"
298
- checksum = "fad582f4b9e86b6caa621cabeb0963332d92eea04729ab12892c2533951e6440"
298
+ checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
299
299
 
300
300
  [[package]]
301
301
  name = "lazy_static"
@@ -311,9 +311,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
311
311
 
312
312
  [[package]]
313
313
  name = "libc"
314
- version = "0.2.139"
314
+ version = "0.2.141"
315
315
  source = "registry+https://github.com/rust-lang/crates.io-index"
316
- checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
316
+ checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5"
317
317
 
318
318
  [[package]]
319
319
  name = "libloading"
@@ -352,8 +352,9 @@ checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d"
352
352
 
353
353
  [[package]]
354
354
  name = "magnus"
355
- version = "0.5.0"
356
- source = "git+https://github.com/matsadler/magnus#eda735faa7e03da2443eaf2c4058a184917d6b87"
355
+ version = "0.5.3"
356
+ source = "registry+https://github.com/rust-lang/crates.io-index"
357
+ checksum = "c8dc14463c2552e753ef562961f486ca76f17a857c121db40e9f3ade3f35ab81"
357
358
  dependencies = [
358
359
  "magnus-macros",
359
360
  "rb-sys",
@@ -362,12 +363,13 @@ dependencies = [
362
363
 
363
364
  [[package]]
364
365
  name = "magnus-macros"
365
- version = "0.3.0"
366
- source = "git+https://github.com/matsadler/magnus#eda735faa7e03da2443eaf2c4058a184917d6b87"
366
+ version = "0.4.1"
367
+ source = "registry+https://github.com/rust-lang/crates.io-index"
368
+ checksum = "6cc17af1d45442c011aa579d727ec6cff8a69aea8a6bbad26736e7112d749bfb"
367
369
  dependencies = [
368
370
  "proc-macro2",
369
371
  "quote",
370
- "syn",
372
+ "syn 1.0.109",
371
373
  ]
372
374
 
373
375
  [[package]]
@@ -378,9 +380,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
378
380
 
379
381
  [[package]]
380
382
  name = "memoffset"
381
- version = "0.7.1"
383
+ version = "0.8.0"
382
384
  source = "registry+https://github.com/rust-lang/crates.io-index"
383
- checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4"
385
+ checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
384
386
  dependencies = [
385
387
  "autocfg",
386
388
  ]
@@ -391,6 +393,27 @@ version = "0.2.1"
391
393
  source = "registry+https://github.com/rust-lang/crates.io-index"
392
394
  checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
393
395
 
396
+ [[package]]
397
+ name = "monostate"
398
+ version = "0.1.6"
399
+ source = "registry+https://github.com/rust-lang/crates.io-index"
400
+ checksum = "0230b703f1ac35df1e24f6d0d2255472bcccaf657ecdfa4f1fcbcad1ad5bb98a"
401
+ dependencies = [
402
+ "monostate-impl",
403
+ "serde",
404
+ ]
405
+
406
+ [[package]]
407
+ name = "monostate-impl"
408
+ version = "0.1.6"
409
+ source = "registry+https://github.com/rust-lang/crates.io-index"
410
+ checksum = "8795add3e14028f11f8e848bd3294898a8294767b3776b6f733560d33bd2530b"
411
+ dependencies = [
412
+ "proc-macro2",
413
+ "quote",
414
+ "syn 2.0.13",
415
+ ]
416
+
394
417
  [[package]]
395
418
  name = "nom"
396
419
  version = "7.1.3"
@@ -419,9 +442,9 @@ checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a"
419
442
 
420
443
  [[package]]
421
444
  name = "once_cell"
422
- version = "1.17.0"
445
+ version = "1.17.1"
423
446
  source = "registry+https://github.com/rust-lang/crates.io-index"
424
- checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66"
447
+ checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
425
448
 
426
449
  [[package]]
427
450
  name = "onig"
@@ -447,9 +470,9 @@ dependencies = [
447
470
 
448
471
  [[package]]
449
472
  name = "paste"
450
- version = "1.0.11"
473
+ version = "1.0.12"
451
474
  source = "registry+https://github.com/rust-lang/crates.io-index"
452
- checksum = "d01a5bd0424d00070b0098dd17ebca6f961a959dead1dbcbbbc1d1cd8d3deeba"
475
+ checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79"
453
476
 
454
477
  [[package]]
455
478
  name = "peeking_take_while"
@@ -471,18 +494,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
471
494
 
472
495
  [[package]]
473
496
  name = "proc-macro2"
474
- version = "1.0.51"
497
+ version = "1.0.56"
475
498
  source = "registry+https://github.com/rust-lang/crates.io-index"
476
- checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6"
499
+ checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435"
477
500
  dependencies = [
478
501
  "unicode-ident",
479
502
  ]
480
503
 
481
504
  [[package]]
482
505
  name = "quote"
483
- version = "1.0.23"
506
+ version = "1.0.26"
484
507
  source = "registry+https://github.com/rust-lang/crates.io-index"
485
- checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
508
+ checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc"
486
509
  dependencies = [
487
510
  "proc-macro2",
488
511
  ]
@@ -519,9 +542,9 @@ dependencies = [
519
542
 
520
543
  [[package]]
521
544
  name = "rayon"
522
- version = "1.6.1"
545
+ version = "1.7.0"
523
546
  source = "registry+https://github.com/rust-lang/crates.io-index"
524
- checksum = "6db3a213adf02b3bcfd2d3846bb41cb22857d131789e01df434fb7e7bc0759b7"
547
+ checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b"
525
548
  dependencies = [
526
549
  "either",
527
550
  "rayon-core",
@@ -540,9 +563,9 @@ dependencies = [
540
563
 
541
564
  [[package]]
542
565
  name = "rayon-core"
543
- version = "1.10.2"
566
+ version = "1.11.0"
544
567
  source = "registry+https://github.com/rust-lang/crates.io-index"
545
- checksum = "356a0625f1954f730c0201cdab48611198dc6ce21f4acff55089b5a78e6e835b"
568
+ checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d"
546
569
  dependencies = [
547
570
  "crossbeam-channel",
548
571
  "crossbeam-deque",
@@ -552,22 +575,26 @@ dependencies = [
552
575
 
553
576
  [[package]]
554
577
  name = "rb-sys"
555
- version = "0.9.64"
578
+ version = "0.9.71"
556
579
  source = "registry+https://github.com/rust-lang/crates.io-index"
557
- checksum = "cc8945662df8083245deda89e236647173cc7ad750f481ddcd7bbfd3afe3fa5e"
580
+ checksum = "156bfedced1e236600bcaad538477097ff2ed5c6b474e411d15b791e1d24c0f1"
558
581
  dependencies = [
559
582
  "rb-sys-build",
560
583
  ]
561
584
 
562
585
  [[package]]
563
586
  name = "rb-sys-build"
564
- version = "0.9.64"
587
+ version = "0.9.71"
565
588
  source = "registry+https://github.com/rust-lang/crates.io-index"
566
- checksum = "ae8c3cdf9edc3908ee1555b7a1bca58ee1b499439b32cd1c1ec3e66736a8df48"
589
+ checksum = "5cb2e4a32cbc290b543a74567072ad24b708aff7bb5dde5a68d5690379cd7938"
567
590
  dependencies = [
568
591
  "bindgen",
592
+ "lazy_static",
593
+ "proc-macro2",
594
+ "quote",
569
595
  "regex",
570
596
  "shell-words",
597
+ "syn 1.0.109",
571
598
  ]
572
599
 
573
600
  [[package]]
@@ -578,9 +605,9 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
578
605
 
579
606
  [[package]]
580
607
  name = "regex"
581
- version = "1.7.1"
608
+ version = "1.7.3"
582
609
  source = "registry+https://github.com/rust-lang/crates.io-index"
583
- checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733"
610
+ checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d"
584
611
  dependencies = [
585
612
  "aho-corasick",
586
613
  "memchr",
@@ -589,9 +616,9 @@ dependencies = [
589
616
 
590
617
  [[package]]
591
618
  name = "regex-syntax"
592
- version = "0.6.28"
619
+ version = "0.6.29"
593
620
  source = "registry+https://github.com/rust-lang/crates.io-index"
594
- checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
621
+ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
595
622
 
596
623
  [[package]]
597
624
  name = "rustc-hash"
@@ -601,9 +628,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
601
628
 
602
629
  [[package]]
603
630
  name = "ryu"
604
- version = "1.0.12"
631
+ version = "1.0.13"
605
632
  source = "registry+https://github.com/rust-lang/crates.io-index"
606
- checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde"
633
+ checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
607
634
 
608
635
  [[package]]
609
636
  name = "scopeguard"
@@ -613,29 +640,29 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
613
640
 
614
641
  [[package]]
615
642
  name = "serde"
616
- version = "1.0.152"
643
+ version = "1.0.159"
617
644
  source = "registry+https://github.com/rust-lang/crates.io-index"
618
- checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb"
645
+ checksum = "3c04e8343c3daeec41f58990b9d77068df31209f2af111e059e9fe9646693065"
619
646
  dependencies = [
620
647
  "serde_derive",
621
648
  ]
622
649
 
623
650
  [[package]]
624
651
  name = "serde_derive"
625
- version = "1.0.152"
652
+ version = "1.0.159"
626
653
  source = "registry+https://github.com/rust-lang/crates.io-index"
627
- checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e"
654
+ checksum = "4c614d17805b093df4b147b51339e7e44bf05ef59fba1e45d83500bcfb4d8585"
628
655
  dependencies = [
629
656
  "proc-macro2",
630
657
  "quote",
631
- "syn",
658
+ "syn 2.0.13",
632
659
  ]
633
660
 
634
661
  [[package]]
635
662
  name = "serde_json"
636
- version = "1.0.92"
663
+ version = "1.0.95"
637
664
  source = "registry+https://github.com/rust-lang/crates.io-index"
638
- checksum = "7434af0dc1cbd59268aa98b4c22c131c0584d2232f6fb166efb993e2832e896a"
665
+ checksum = "d721eca97ac802aa7777b701877c8004d950fc142651367300d21c1cc0194744"
639
666
  dependencies = [
640
667
  "itoa",
641
668
  "ryu",
@@ -680,9 +707,20 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
680
707
 
681
708
  [[package]]
682
709
  name = "syn"
683
- version = "1.0.107"
710
+ version = "1.0.109"
684
711
  source = "registry+https://github.com/rust-lang/crates.io-index"
685
- checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5"
712
+ checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
713
+ dependencies = [
714
+ "proc-macro2",
715
+ "quote",
716
+ "unicode-ident",
717
+ ]
718
+
719
+ [[package]]
720
+ name = "syn"
721
+ version = "2.0.13"
722
+ source = "registry+https://github.com/rust-lang/crates.io-index"
723
+ checksum = "4c9da457c5285ac1f936ebd076af6dac17a61cfe7826f2076b4d015cf47bc8ec"
686
724
  dependencies = [
687
725
  "proc-macro2",
688
726
  "quote",
@@ -691,38 +729,39 @@ dependencies = [
691
729
 
692
730
  [[package]]
693
731
  name = "thiserror"
694
- version = "1.0.38"
732
+ version = "1.0.40"
695
733
  source = "registry+https://github.com/rust-lang/crates.io-index"
696
- checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0"
734
+ checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
697
735
  dependencies = [
698
736
  "thiserror-impl",
699
737
  ]
700
738
 
701
739
  [[package]]
702
740
  name = "thiserror-impl"
703
- version = "1.0.38"
741
+ version = "1.0.40"
704
742
  source = "registry+https://github.com/rust-lang/crates.io-index"
705
- checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f"
743
+ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
706
744
  dependencies = [
707
745
  "proc-macro2",
708
746
  "quote",
709
- "syn",
747
+ "syn 2.0.13",
710
748
  ]
711
749
 
712
750
  [[package]]
713
751
  name = "tokenizers"
714
- version = "0.3.1"
752
+ version = "0.3.3"
715
753
  dependencies = [
716
754
  "magnus",
717
755
  "onig",
718
756
  "serde",
719
- "tokenizers 0.13.2",
757
+ "tokenizers 0.13.3",
720
758
  ]
721
759
 
722
760
  [[package]]
723
761
  name = "tokenizers"
724
- version = "0.13.2"
725
- source = "git+https://github.com/huggingface/tokenizers#fa66caf0abff16bae2213658ffa3e969c5445750"
762
+ version = "0.13.3"
763
+ source = "registry+https://github.com/rust-lang/crates.io-index"
764
+ checksum = "5cf49017523bf0bc01c9966f172c5f120bbb7b96cccd1708772dd42e767fb9f5"
726
765
  dependencies = [
727
766
  "aho-corasick",
728
767
  "derive_builder",
@@ -733,6 +772,7 @@ dependencies = [
733
772
  "lazy_static",
734
773
  "log",
735
774
  "macro_rules_attribute",
775
+ "monostate",
736
776
  "onig",
737
777
  "paste",
738
778
  "rand",
@@ -751,9 +791,9 @@ dependencies = [
751
791
 
752
792
  [[package]]
753
793
  name = "unicode-ident"
754
- version = "1.0.6"
794
+ version = "1.0.8"
755
795
  source = "registry+https://github.com/rust-lang/crates.io-index"
756
- checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
796
+ checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
757
797
 
758
798
  [[package]]
759
799
  name = "unicode-normalization-alignments"
@@ -827,42 +867,42 @@ dependencies = [
827
867
 
828
868
  [[package]]
829
869
  name = "windows_aarch64_gnullvm"
830
- version = "0.42.1"
870
+ version = "0.42.2"
831
871
  source = "registry+https://github.com/rust-lang/crates.io-index"
832
- checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608"
872
+ checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
833
873
 
834
874
  [[package]]
835
875
  name = "windows_aarch64_msvc"
836
- version = "0.42.1"
876
+ version = "0.42.2"
837
877
  source = "registry+https://github.com/rust-lang/crates.io-index"
838
- checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7"
878
+ checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
839
879
 
840
880
  [[package]]
841
881
  name = "windows_i686_gnu"
842
- version = "0.42.1"
882
+ version = "0.42.2"
843
883
  source = "registry+https://github.com/rust-lang/crates.io-index"
844
- checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640"
884
+ checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
845
885
 
846
886
  [[package]]
847
887
  name = "windows_i686_msvc"
848
- version = "0.42.1"
888
+ version = "0.42.2"
849
889
  source = "registry+https://github.com/rust-lang/crates.io-index"
850
- checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605"
890
+ checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
851
891
 
852
892
  [[package]]
853
893
  name = "windows_x86_64_gnu"
854
- version = "0.42.1"
894
+ version = "0.42.2"
855
895
  source = "registry+https://github.com/rust-lang/crates.io-index"
856
- checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45"
896
+ checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
857
897
 
858
898
  [[package]]
859
899
  name = "windows_x86_64_gnullvm"
860
- version = "0.42.1"
900
+ version = "0.42.2"
861
901
  source = "registry+https://github.com/rust-lang/crates.io-index"
862
- checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463"
902
+ checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
863
903
 
864
904
  [[package]]
865
905
  name = "windows_x86_64_msvc"
866
- version = "0.42.1"
906
+ version = "0.42.2"
867
907
  source = "registry+https://github.com/rust-lang/crates.io-index"
868
- checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd"
908
+ checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "tokenizers"
3
- version = "0.3.1"
3
+ version = "0.3.3"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -10,12 +10,11 @@ publish = false
10
10
  crate-type = ["cdylib"]
11
11
 
12
12
  [dependencies]
13
- magnus = { git = "https://github.com/matsadler/magnus" }
14
- onig = { version = "6.0", default-features = false }
15
- serde = { version = "1.0", features = ["rc", "derive"] }
13
+ magnus = "0.5"
14
+ onig = { version = "6", default-features = false }
15
+ serde = { version = "1", features = ["rc", "derive"] }
16
16
 
17
17
  [dependencies.tokenizers]
18
- version = "0.13.2" # also update in from_pretrained.rb
19
- git = "https://github.com/huggingface/tokenizers"
18
+ version = "=0.13.3" # also update in from_pretrained.rb
20
19
  default-features = false
21
20
  features = ["progressbar", "onig", "esaxx_fast"]
@@ -7,14 +7,19 @@ use magnus::{
7
7
  };
8
8
  use serde::{Deserialize, Serialize};
9
9
  use tk::decoders::bpe::BPEDecoder;
10
+ use tk::decoders::byte_fallback::ByteFallback;
10
11
  use tk::decoders::byte_level::ByteLevel;
11
12
  use tk::decoders::ctc::CTC;
13
+ use tk::decoders::fuse::Fuse;
12
14
  use tk::decoders::metaspace::Metaspace;
15
+ use tk::decoders::strip::Strip;
13
16
  use tk::decoders::wordpiece::WordPiece;
14
17
  use tk::decoders::DecoderWrapper;
15
18
  use tk::Decoder;
19
+ use tk::normalizers::replace::Replace;
16
20
 
17
- use super::RbResult;
21
+ use super::utils::*;
22
+ use super::{RbError, RbResult};
18
23
 
19
24
  #[derive(DataTypeFunctions, Clone, Deserialize, Serialize)]
20
25
  pub struct RbDecoder {
@@ -89,6 +94,30 @@ impl RbDecoder {
89
94
  setter!(self, CTC, word_delimiter_token, word_delimiter_token);
90
95
  }
91
96
 
97
+ fn strip_content(&self) -> char {
98
+ getter!(self, Strip, content)
99
+ }
100
+
101
+ fn strip_set_content(&self, content: char) {
102
+ setter!(self, Strip, content, content)
103
+ }
104
+
105
+ fn strip_start(&self) -> usize {
106
+ getter!(self, Strip, start)
107
+ }
108
+
109
+ fn strip_set_start(&self, start: usize) {
110
+ setter!(self, Strip, start, start)
111
+ }
112
+
113
+ fn strip_stop(&self) -> usize {
114
+ getter!(self, Strip, stop)
115
+ }
116
+
117
+ fn strip_set_stop(&self, stop: usize) {
118
+ setter!(self, Strip, stop, stop)
119
+ }
120
+
92
121
  pub fn metaspace_replacement(&self) -> char {
93
122
  getter!(self, Metaspace, get_replacement().clone())
94
123
  }
@@ -130,6 +159,14 @@ impl RbBPEDecoder {
130
159
  }
131
160
  }
132
161
 
162
+ pub struct RbByteFallbackDecoder {}
163
+
164
+ impl RbByteFallbackDecoder {
165
+ pub fn new() -> RbDecoder {
166
+ ByteFallback::default().into()
167
+ }
168
+ }
169
+
133
170
  pub struct RbByteLevelDecoder {}
134
171
 
135
172
  impl RbByteLevelDecoder {
@@ -146,6 +183,14 @@ impl RbCTC {
146
183
  }
147
184
  }
148
185
 
186
+ pub struct RbFuse {}
187
+
188
+ impl RbFuse {
189
+ pub fn new() -> RbDecoder {
190
+ Fuse::default().into()
191
+ }
192
+ }
193
+
149
194
  pub struct RbMetaspaceDecoder {}
150
195
 
151
196
  impl RbMetaspaceDecoder {
@@ -154,6 +199,22 @@ impl RbMetaspaceDecoder {
154
199
  }
155
200
  }
156
201
 
202
+ pub struct RbReplaceDecoder {}
203
+
204
+ impl RbReplaceDecoder {
205
+ pub fn new(pattern: RbPattern, content: String) -> RbResult<RbDecoder> {
206
+ Replace::new(pattern, content).map(|v| v.into()).map_err(RbError::from)
207
+ }
208
+ }
209
+
210
+ pub struct RbStripDecoder {}
211
+
212
+ impl RbStripDecoder {
213
+ pub fn new(content: char, start: usize, stop: usize) -> RbDecoder {
214
+ Strip::new(content, start, stop).into()
215
+ }
216
+ }
217
+
157
218
  pub struct RbWordPieceDecoder {}
158
219
 
159
220
  impl RbWordPieceDecoder {
@@ -219,6 +280,11 @@ unsafe impl TypedData for RbDecoder {
219
280
  class.undef_alloc_func();
220
281
  class
221
282
  }),
283
+ DecoderWrapper::ByteFallback(_) => *memoize!(RClass: {
284
+ let class: RClass = crate::decoders().const_get("ByteFallback").unwrap();
285
+ class.undef_alloc_func();
286
+ class
287
+ }),
222
288
  DecoderWrapper::ByteLevel(_) => *memoize!(RClass: {
223
289
  let class: RClass = crate::decoders().const_get("ByteLevel").unwrap();
224
290
  class.undef_alloc_func();
@@ -229,11 +295,26 @@ unsafe impl TypedData for RbDecoder {
229
295
  class.undef_alloc_func();
230
296
  class
231
297
  }),
298
+ DecoderWrapper::Fuse(_) => *memoize!(RClass: {
299
+ let class: RClass = crate::decoders().const_get("Fuse").unwrap();
300
+ class.undef_alloc_func();
301
+ class
302
+ }),
232
303
  DecoderWrapper::Metaspace(_) => *memoize!(RClass: {
233
304
  let class: RClass = crate::decoders().const_get("Metaspace").unwrap();
234
305
  class.undef_alloc_func();
235
306
  class
236
307
  }),
308
+ DecoderWrapper::Replace(_) => *memoize!(RClass: {
309
+ let class: RClass = crate::decoders().const_get("Replace").unwrap();
310
+ class.undef_alloc_func();
311
+ class
312
+ }),
313
+ DecoderWrapper::Strip(_) => *memoize!(RClass: {
314
+ let class: RClass = crate::decoders().const_get("Strip").unwrap();
315
+ class.undef_alloc_func();
316
+ class
317
+ }),
237
318
  DecoderWrapper::WordPiece(_) => *memoize!(RClass: {
238
319
  let class: RClass = crate::decoders().const_get("WordPiece").unwrap();
239
320
  class.undef_alloc_func();
@@ -253,6 +334,9 @@ pub fn decoders(module: &RModule) -> RbResult<()> {
253
334
  class.define_method("suffix", method!(RbDecoder::bpe_suffix, 0))?;
254
335
  class.define_method("suffix=", method!(RbDecoder::bpe_set_suffix, 1))?;
255
336
 
337
+ let class = module.define_class("ByteFallback", decoder)?;
338
+ class.define_singleton_method("new", function!(RbByteFallbackDecoder::new, 0))?;
339
+
256
340
  let class = module.define_class("ByteLevel", decoder)?;
257
341
  class.define_singleton_method("new", function!(RbByteLevelDecoder::new, 0))?;
258
342
 
@@ -265,6 +349,9 @@ pub fn decoders(module: &RModule) -> RbResult<()> {
265
349
  class.define_method("word_delimiter_token", method!(RbDecoder::ctc_word_delimiter_token, 0))?;
266
350
  class.define_method("word_delimiter_token=", method!(RbDecoder::ctc_set_word_delimiter_token, 1))?;
267
351
 
352
+ let class = module.define_class("Fuse", decoder)?;
353
+ class.define_singleton_method("new", function!(RbFuse::new, 0))?;
354
+
268
355
  let class = module.define_class("Metaspace", decoder)?;
269
356
  class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 2))?;
270
357
  class.define_method("add_prefix_space", method!(RbDecoder::metaspace_add_prefix_space, 0))?;
@@ -272,6 +359,18 @@ pub fn decoders(module: &RModule) -> RbResult<()> {
272
359
  class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
273
360
  class.define_method("replacement=", method!(RbDecoder::metaspace_set_replacement, 1))?;
274
361
 
362
+ let class = module.define_class("Replace", decoder)?;
363
+ class.define_singleton_method("new", function!(RbReplaceDecoder::new, 2))?;
364
+
365
+ let class = module.define_class("Strip", decoder)?;
366
+ class.define_singleton_method("_new", function!(RbStripDecoder::new, 3))?;
367
+ class.define_method("content", method!(RbDecoder::strip_content, 0))?;
368
+ class.define_method("content=", method!(RbDecoder::strip_set_content, 1))?;
369
+ class.define_method("start", method!(RbDecoder::strip_start, 0))?;
370
+ class.define_method("start=", method!(RbDecoder::strip_set_start, 1))?;
371
+ class.define_method("stop", method!(RbDecoder::strip_stop, 0))?;
372
+ class.define_method("stop=", method!(RbDecoder::strip_set_stop, 1))?;
373
+
275
374
  let class = module.define_class("WordPiece", decoder)?;
276
375
  class.define_singleton_method("_new", function!(RbWordPieceDecoder::new, 2))?;
277
376
  class.define_method("cleanup", method!(RbDecoder::word_piece_cleanup, 0))?;
@@ -101,6 +101,11 @@ impl RbBPE {
101
101
  builder = builder.fuse_unk(value.try_convert()?);
102
102
  }
103
103
 
104
+ let value: Value = kwargs.delete(Symbol::new("byte_fallback"))?;
105
+ if !value.is_nil() {
106
+ builder = builder.byte_fallback(value.try_convert()?);
107
+ }
108
+
104
109
  if !kwargs.is_empty() {
105
110
  // TODO improve message
106
111
  return Err(Error::new(exception::arg_error(), "unknown keyword"));
@@ -169,6 +174,14 @@ impl RbModel {
169
174
  setter!(self, BPE, fuse_unk, fuse_unk);
170
175
  }
171
176
 
177
+ pub fn bpe_byte_fallback(&self) -> bool {
178
+ getter!(self, BPE, byte_fallback)
179
+ }
180
+
181
+ pub fn bpe_set_byte_fallback(&self, byte_fallback: bool) {
182
+ setter!(self, BPE, byte_fallback, byte_fallback);
183
+ }
184
+
172
185
  pub fn bpe_continuing_subword_prefix(&self) -> Option<String> {
173
186
  getter!(self, BPE, continuing_subword_prefix.clone())
174
187
  }
@@ -355,6 +368,8 @@ pub fn models(module: &RModule) -> RbResult<()> {
355
368
  class.define_method("end_of_word_suffix=", method!(RbModel::bpe_set_end_of_word_suffix, 1))?;
356
369
  class.define_method("fuse_unk", method!(RbModel::bpe_fuse_unk, 0))?;
357
370
  class.define_method("fuse_unk=", method!(RbModel::bpe_set_fuse_unk, 1))?;
371
+ class.define_method("byte_fallback", method!(RbModel::bpe_byte_fallback, 0))?;
372
+ class.define_method("byte_fallback=", method!(RbModel::bpe_set_byte_fallback, 1))?;
358
373
 
359
374
  let class = module.define_class("Unigram", model)?;
360
375
  class.define_singleton_method("_new", function!(RbUnigram::new, 2))?;
@@ -8,7 +8,7 @@ use magnus::{
8
8
  use serde::ser::SerializeStruct;
9
9
  use serde::{Deserialize, Serialize, Serializer};
10
10
  use tk::normalizers::{
11
- BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Replace, Strip, StripAccents,
11
+ BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Replace, Prepend, Strip, StripAccents,
12
12
  NFC, NFD, NFKC, NFKD,
13
13
  };
14
14
  use tk::{NormalizedString, Normalizer};
@@ -44,7 +44,7 @@ macro_rules! getter {
44
44
  ($self: ident, $variant: ident, $name: ident) => {{
45
45
  if let RbNormalizerTypeWrapper::Single(ref norm) = &$self.normalizer {
46
46
  let wrapper = norm.read().unwrap();
47
- if let RbNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = *wrapper {
47
+ if let RbNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (*wrapper).clone() {
48
48
  o.$name
49
49
  } else {
50
50
  unreachable!()
@@ -105,6 +105,14 @@ impl RbNormalizer {
105
105
  setter!(self, BertNormalizer, lowercase, lowercase)
106
106
  }
107
107
 
108
+ fn prepend_prepend(&self) -> String {
109
+ getter!(self, Prepend, prepend)
110
+ }
111
+
112
+ fn prepend_set_prepend(&self, prepend: String) {
113
+ setter!(self, Prepend, prepend, prepend)
114
+ }
115
+
108
116
  fn strip_left(&self) -> bool {
109
117
  getter!(self, StripNormalizer, strip_left)
110
118
  }
@@ -186,6 +194,14 @@ impl RbReplace {
186
194
  }
187
195
  }
188
196
 
197
+ pub struct RbPrepend {}
198
+
199
+ impl RbPrepend {
200
+ pub fn new(prepend: String) -> RbNormalizer {
201
+ Prepend::new(prepend).into()
202
+ }
203
+ }
204
+
189
205
  pub struct RbStrip {}
190
206
 
191
207
  impl RbStrip {
@@ -372,6 +388,11 @@ unsafe impl TypedData for RbNormalizer {
372
388
  class.undef_alloc_func();
373
389
  class
374
390
  }),
391
+ NormalizerWrapper::Prepend(_) => *memoize!(RClass: {
392
+ let class: RClass = crate::normalizers().const_get("Prepend").unwrap();
393
+ class.undef_alloc_func();
394
+ class
395
+ }),
375
396
  NormalizerWrapper::StripNormalizer(_) => *memoize!(RClass: {
376
397
  let class: RClass = crate::normalizers().const_get("Strip").unwrap();
377
398
  class.undef_alloc_func();
@@ -428,6 +449,11 @@ pub fn normalizers(module: &RModule) -> RbResult<()> {
428
449
  let class = module.define_class("Replace", normalizer)?;
429
450
  class.define_singleton_method("new", function!(RbReplace::new, 2))?;
430
451
 
452
+ let class = module.define_class("Prepend", normalizer)?;
453
+ class.define_singleton_method("_new", function!(RbPrepend::new, 1))?;
454
+ class.define_method("prepend", method!(RbNormalizer::prepend_prepend, 0))?;
455
+ class.define_method("prepend=", method!(RbNormalizer::prepend_set_prepend, 1))?;
456
+
431
457
  let class = module.define_class("Strip", normalizer)?;
432
458
  class.define_singleton_method("_new", function!(RbStrip::new, 2))?;
433
459
  class.define_method("left", method!(RbNormalizer::strip_left, 0))?;
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Decoders
3
+ class Strip
4
+ def self.new(content: " ", start: 0, stop: 0)
5
+ _new(content, start, stop)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.13.2"
4
+ TOKENIZERS_VERSION = "0.13.3"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -0,0 +1,9 @@
1
+ module Tokenizers
2
+ module Normalizers
3
+ class Prepend
4
+ def self.new(prepend: "▁")
5
+ _new(prepend)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.3.1"
2
+ VERSION = "0.3.3"
3
3
  end
data/lib/tokenizers.rb CHANGED
@@ -9,6 +9,7 @@ end
9
9
  require_relative "tokenizers/decoders/bpe_decoder"
10
10
  require_relative "tokenizers/decoders/ctc"
11
11
  require_relative "tokenizers/decoders/metaspace"
12
+ require_relative "tokenizers/decoders/strip"
12
13
  require_relative "tokenizers/decoders/word_piece"
13
14
 
14
15
  # models
@@ -19,6 +20,7 @@ require_relative "tokenizers/models/unigram"
19
20
 
20
21
  # normalizers
21
22
  require_relative "tokenizers/normalizers/bert_normalizer"
23
+ require_relative "tokenizers/normalizers/prepend"
22
24
  require_relative "tokenizers/normalizers/strip"
23
25
 
24
26
  # pre-tokenizers
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-02-09 00:00:00.000000000 Z
11
+ date: 2023-04-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -56,6 +56,7 @@ files:
56
56
  - lib/tokenizers/decoders/bpe_decoder.rb
57
57
  - lib/tokenizers/decoders/ctc.rb
58
58
  - lib/tokenizers/decoders/metaspace.rb
59
+ - lib/tokenizers/decoders/strip.rb
59
60
  - lib/tokenizers/decoders/word_piece.rb
60
61
  - lib/tokenizers/encoding.rb
61
62
  - lib/tokenizers/from_pretrained.rb
@@ -64,6 +65,7 @@ files:
64
65
  - lib/tokenizers/models/word_level.rb
65
66
  - lib/tokenizers/models/word_piece.rb
66
67
  - lib/tokenizers/normalizers/bert_normalizer.rb
68
+ - lib/tokenizers/normalizers/prepend.rb
67
69
  - lib/tokenizers/normalizers/strip.rb
68
70
  - lib/tokenizers/pre_tokenizers/byte_level.rb
69
71
  - lib/tokenizers/pre_tokenizers/digits.rb
@@ -98,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
98
100
  - !ruby/object:Gem::Version
99
101
  version: '0'
100
102
  requirements: []
101
- rubygems_version: 3.4.6
103
+ rubygems_version: 3.4.10
102
104
  signing_key:
103
105
  specification_version: 4
104
106
  summary: Fast state-of-the-art tokenizers for Ruby