tokenizers 0.2.2-x86_64-linux → 0.3.0-x86_64-linux
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/Cargo.lock +33 -74
- data/LICENSE-THIRD-PARTY.txt +41 -685
- data/README.md +4 -0
- data/lib/tokenizers/2.7/tokenizers.so +0 -0
- data/lib/tokenizers/3.0/tokenizers.so +0 -0
- data/lib/tokenizers/3.1/tokenizers.so +0 -0
- data/lib/tokenizers/3.2/tokenizers.so +0 -0
- data/lib/tokenizers/char_bpe_tokenizer.rb +11 -8
- data/lib/tokenizers/decoders/bpe_decoder.rb +9 -0
- data/lib/tokenizers/decoders/ctc.rb +9 -0
- data/lib/tokenizers/decoders/metaspace.rb +9 -0
- data/lib/tokenizers/decoders/word_piece.rb +9 -0
- data/lib/tokenizers/encoding.rb +19 -0
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/models/bpe.rb +9 -0
- data/lib/tokenizers/models/unigram.rb +9 -0
- data/lib/tokenizers/models/word_level.rb +13 -0
- data/lib/tokenizers/models/word_piece.rb +9 -0
- data/lib/tokenizers/normalizers/bert_normalizer.rb +9 -0
- data/lib/tokenizers/normalizers/strip.rb +9 -0
- data/lib/tokenizers/pre_tokenizers/byte_level.rb +9 -0
- data/lib/tokenizers/pre_tokenizers/digits.rb +9 -0
- data/lib/tokenizers/pre_tokenizers/metaspace.rb +9 -0
- data/lib/tokenizers/pre_tokenizers/punctuation.rb +9 -0
- data/lib/tokenizers/pre_tokenizers/split.rb +9 -0
- data/lib/tokenizers/processors/byte_level.rb +9 -0
- data/lib/tokenizers/processors/roberta_processing.rb +9 -0
- data/lib/tokenizers/processors/template_processing.rb +9 -0
- data/lib/tokenizers/tokenizer.rb +45 -0
- data/lib/tokenizers/trainers/bpe_trainer.rb +9 -0
- data/lib/tokenizers/trainers/unigram_trainer.rb +26 -0
- data/lib/tokenizers/trainers/word_level_trainer.rb +9 -0
- data/lib/tokenizers/trainers/word_piece_trainer.rb +26 -0
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +49 -7
- metadata +27 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1dd7a6e80756ca265d9152f6fa371e3debcc65ad511a9049d496bfe9265dd2f7
|
4
|
+
data.tar.gz: 36bcd374aa6d797fdb44014cc6c6c080042dff9a90c1b28ccebed0fb8d2d68d1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 54b887046decc5dbd22549309b9087f1c79751269202ac1aa6ea04616388dcbbada02b2b926aa3649b9208747811250aad0f56d5cbd751966697b3e4f9c7447c
|
7
|
+
data.tar.gz: 8d7259138060315e6dd159a8b9b5622e20a366d72e9728e5f0c8a3a986a39b98f388dc779075ef873b18e5648139a1cbbb17e105172fc760967dbe62442ebc10
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
## 0.3.0 (2022-02-07)
|
2
|
+
|
3
|
+
- Added support for training tokenizers
|
4
|
+
- Added more methods to `Tokenizer`
|
5
|
+
- Added `encode_batch` method to `Encoding`
|
6
|
+
- Added `pair` argument to `encode` method
|
7
|
+
- Changed `encode` method to include special tokens by default
|
8
|
+
- Changed how offsets are calculated for strings with multibyte characters
|
9
|
+
|
10
|
+
## 0.2.3 (2022-01-22)
|
11
|
+
|
12
|
+
- Added `add_special_tokens` option to `encode` method
|
13
|
+
- Added warning about `encode` method including special tokens by default in 0.3.0
|
14
|
+
- Added more methods to `Encoding`
|
15
|
+
- Fixed error with precompiled gem on Mac ARM
|
16
|
+
|
1
17
|
## 0.2.2 (2022-01-15)
|
2
18
|
|
3
19
|
- Added precompiled gem for Linux ARM
|
data/Cargo.lock
CHANGED
@@ -50,9 +50,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
|
50
50
|
|
51
51
|
[[package]]
|
52
52
|
name = "cc"
|
53
|
-
version = "1.0.
|
53
|
+
version = "1.0.79"
|
54
54
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
55
|
-
checksum = "
|
55
|
+
checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
|
56
56
|
|
57
57
|
[[package]]
|
58
58
|
name = "cexpr"
|
@@ -138,9 +138,9 @@ dependencies = [
|
|
138
138
|
|
139
139
|
[[package]]
|
140
140
|
name = "darling"
|
141
|
-
version = "0.14.
|
141
|
+
version = "0.14.3"
|
142
142
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
143
|
-
checksum = "
|
143
|
+
checksum = "c0808e1bd8671fb44a113a14e13497557533369847788fa2ae912b6ebfce9fa8"
|
144
144
|
dependencies = [
|
145
145
|
"darling_core",
|
146
146
|
"darling_macro",
|
@@ -148,9 +148,9 @@ dependencies = [
|
|
148
148
|
|
149
149
|
[[package]]
|
150
150
|
name = "darling_core"
|
151
|
-
version = "0.14.
|
151
|
+
version = "0.14.3"
|
152
152
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
153
|
-
checksum = "
|
153
|
+
checksum = "001d80444f28e193f30c2f293455da62dcf9a6b29918a4253152ae2b1de592cb"
|
154
154
|
dependencies = [
|
155
155
|
"fnv",
|
156
156
|
"ident_case",
|
@@ -162,9 +162,9 @@ dependencies = [
|
|
162
162
|
|
163
163
|
[[package]]
|
164
164
|
name = "darling_macro"
|
165
|
-
version = "0.14.
|
165
|
+
version = "0.14.3"
|
166
166
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
167
|
-
checksum = "
|
167
|
+
checksum = "b36230598a2d5de7ec1c6f51f72d8a99a9208daff41de2084d06e3fd3ea56685"
|
168
168
|
dependencies = [
|
169
169
|
"darling_core",
|
170
170
|
"quote",
|
@@ -202,31 +202,11 @@ dependencies = [
|
|
202
202
|
"syn",
|
203
203
|
]
|
204
204
|
|
205
|
-
[[package]]
|
206
|
-
name = "dirs"
|
207
|
-
version = "3.0.2"
|
208
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
209
|
-
checksum = "30baa043103c9d0c2a57cf537cc2f35623889dc0d405e6c3cccfadbc81c71309"
|
210
|
-
dependencies = [
|
211
|
-
"dirs-sys",
|
212
|
-
]
|
213
|
-
|
214
|
-
[[package]]
|
215
|
-
name = "dirs-sys"
|
216
|
-
version = "0.3.7"
|
217
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
218
|
-
checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6"
|
219
|
-
dependencies = [
|
220
|
-
"libc",
|
221
|
-
"redox_users",
|
222
|
-
"winapi",
|
223
|
-
]
|
224
|
-
|
225
205
|
[[package]]
|
226
206
|
name = "either"
|
227
|
-
version = "1.8.
|
207
|
+
version = "1.8.1"
|
228
208
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
229
|
-
checksum = "
|
209
|
+
checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91"
|
230
210
|
|
231
211
|
[[package]]
|
232
212
|
name = "encode_unicode"
|
@@ -372,9 +352,8 @@ checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d"
|
|
372
352
|
|
373
353
|
[[package]]
|
374
354
|
name = "magnus"
|
375
|
-
version = "0.
|
376
|
-
source = "
|
377
|
-
checksum = "fc87660cd7daa49fddbfd524c836de54d5c927d520cd163f43700c5087c57d6c"
|
355
|
+
version = "0.5.0"
|
356
|
+
source = "git+https://github.com/matsadler/magnus#eda735faa7e03da2443eaf2c4058a184917d6b87"
|
378
357
|
dependencies = [
|
379
358
|
"magnus-macros",
|
380
359
|
"rb-sys",
|
@@ -384,8 +363,7 @@ dependencies = [
|
|
384
363
|
[[package]]
|
385
364
|
name = "magnus-macros"
|
386
365
|
version = "0.3.0"
|
387
|
-
source = "
|
388
|
-
checksum = "206cb23bfeea05180c97522ef6a3e52a4eb17b0ed2f30ee3ca9c4f994d2378ae"
|
366
|
+
source = "git+https://github.com/matsadler/magnus#eda735faa7e03da2443eaf2c4058a184917d6b87"
|
389
367
|
dependencies = [
|
390
368
|
"proc-macro2",
|
391
369
|
"quote",
|
@@ -415,9 +393,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
|
415
393
|
|
416
394
|
[[package]]
|
417
395
|
name = "nom"
|
418
|
-
version = "7.1.
|
396
|
+
version = "7.1.3"
|
419
397
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
420
|
-
checksum = "
|
398
|
+
checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
|
421
399
|
dependencies = [
|
422
400
|
"memchr",
|
423
401
|
"minimal-lexical",
|
@@ -493,9 +471,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
|
|
493
471
|
|
494
472
|
[[package]]
|
495
473
|
name = "proc-macro2"
|
496
|
-
version = "1.0.
|
474
|
+
version = "1.0.51"
|
497
475
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
498
|
-
checksum = "
|
476
|
+
checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6"
|
499
477
|
dependencies = [
|
500
478
|
"unicode-ident",
|
501
479
|
]
|
@@ -562,9 +540,9 @@ dependencies = [
|
|
562
540
|
|
563
541
|
[[package]]
|
564
542
|
name = "rayon-core"
|
565
|
-
version = "1.10.
|
543
|
+
version = "1.10.2"
|
566
544
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
567
|
-
checksum = "
|
545
|
+
checksum = "356a0625f1954f730c0201cdab48611198dc6ce21f4acff55089b5a78e6e835b"
|
568
546
|
dependencies = [
|
569
547
|
"crossbeam-channel",
|
570
548
|
"crossbeam-deque",
|
@@ -574,18 +552,18 @@ dependencies = [
|
|
574
552
|
|
575
553
|
[[package]]
|
576
554
|
name = "rb-sys"
|
577
|
-
version = "0.9.
|
555
|
+
version = "0.9.64"
|
578
556
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
579
|
-
checksum = "
|
557
|
+
checksum = "cc8945662df8083245deda89e236647173cc7ad750f481ddcd7bbfd3afe3fa5e"
|
580
558
|
dependencies = [
|
581
559
|
"rb-sys-build",
|
582
560
|
]
|
583
561
|
|
584
562
|
[[package]]
|
585
563
|
name = "rb-sys-build"
|
586
|
-
version = "0.9.
|
564
|
+
version = "0.9.64"
|
587
565
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
588
|
-
checksum = "
|
566
|
+
checksum = "ae8c3cdf9edc3908ee1555b7a1bca58ee1b499439b32cd1c1ec3e66736a8df48"
|
589
567
|
dependencies = [
|
590
568
|
"bindgen",
|
591
569
|
"regex",
|
@@ -594,29 +572,9 @@ dependencies = [
|
|
594
572
|
|
595
573
|
[[package]]
|
596
574
|
name = "rb-sys-env"
|
597
|
-
version = "0.1.
|
598
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
599
|
-
checksum = "74c38752410925faeb82c400c06ba2fd9ee6aa8f719dd33994c9e53f5242d25f"
|
600
|
-
|
601
|
-
[[package]]
|
602
|
-
name = "redox_syscall"
|
603
|
-
version = "0.2.16"
|
575
|
+
version = "0.1.2"
|
604
576
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
605
|
-
checksum = "
|
606
|
-
dependencies = [
|
607
|
-
"bitflags",
|
608
|
-
]
|
609
|
-
|
610
|
-
[[package]]
|
611
|
-
name = "redox_users"
|
612
|
-
version = "0.4.3"
|
613
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
614
|
-
checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b"
|
615
|
-
dependencies = [
|
616
|
-
"getrandom",
|
617
|
-
"redox_syscall",
|
618
|
-
"thiserror",
|
619
|
-
]
|
577
|
+
checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
|
620
578
|
|
621
579
|
[[package]]
|
622
580
|
name = "regex"
|
@@ -675,9 +633,9 @@ dependencies = [
|
|
675
633
|
|
676
634
|
[[package]]
|
677
635
|
name = "serde_json"
|
678
|
-
version = "1.0.
|
636
|
+
version = "1.0.92"
|
679
637
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
680
|
-
checksum = "
|
638
|
+
checksum = "7434af0dc1cbd59268aa98b4c22c131c0584d2232f6fb166efb993e2832e896a"
|
681
639
|
dependencies = [
|
682
640
|
"itoa",
|
683
641
|
"ryu",
|
@@ -753,20 +711,21 @@ dependencies = [
|
|
753
711
|
|
754
712
|
[[package]]
|
755
713
|
name = "tokenizers"
|
756
|
-
version = "0.2.
|
714
|
+
version = "0.2.3"
|
757
715
|
dependencies = [
|
758
716
|
"magnus",
|
717
|
+
"onig",
|
718
|
+
"serde",
|
759
719
|
"tokenizers 0.13.2",
|
760
720
|
]
|
761
721
|
|
762
722
|
[[package]]
|
763
723
|
name = "tokenizers"
|
764
724
|
version = "0.13.2"
|
765
|
-
source = "git+https://github.com/huggingface/tokenizers#
|
725
|
+
source = "git+https://github.com/huggingface/tokenizers#fa66caf0abff16bae2213658ffa3e969c5445750"
|
766
726
|
dependencies = [
|
767
727
|
"aho-corasick",
|
768
728
|
"derive_builder",
|
769
|
-
"dirs",
|
770
729
|
"esaxx-rs",
|
771
730
|
"getrandom",
|
772
731
|
"indicatif",
|
@@ -807,9 +766,9 @@ dependencies = [
|
|
807
766
|
|
808
767
|
[[package]]
|
809
768
|
name = "unicode-segmentation"
|
810
|
-
version = "1.10.
|
769
|
+
version = "1.10.1"
|
811
770
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
812
|
-
checksum = "
|
771
|
+
checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
|
813
772
|
|
814
773
|
[[package]]
|
815
774
|
name = "unicode-width"
|