tokenizers 0.2.2-x86_64-linux → 0.3.0-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/Cargo.lock +33 -74
- data/LICENSE-THIRD-PARTY.txt +41 -685
- data/README.md +4 -0
- data/lib/tokenizers/2.7/tokenizers.so +0 -0
- data/lib/tokenizers/3.0/tokenizers.so +0 -0
- data/lib/tokenizers/3.1/tokenizers.so +0 -0
- data/lib/tokenizers/3.2/tokenizers.so +0 -0
- data/lib/tokenizers/char_bpe_tokenizer.rb +11 -8
- data/lib/tokenizers/decoders/bpe_decoder.rb +9 -0
- data/lib/tokenizers/decoders/ctc.rb +9 -0
- data/lib/tokenizers/decoders/metaspace.rb +9 -0
- data/lib/tokenizers/decoders/word_piece.rb +9 -0
- data/lib/tokenizers/encoding.rb +19 -0
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/models/bpe.rb +9 -0
- data/lib/tokenizers/models/unigram.rb +9 -0
- data/lib/tokenizers/models/word_level.rb +13 -0
- data/lib/tokenizers/models/word_piece.rb +9 -0
- data/lib/tokenizers/normalizers/bert_normalizer.rb +9 -0
- data/lib/tokenizers/normalizers/strip.rb +9 -0
- data/lib/tokenizers/pre_tokenizers/byte_level.rb +9 -0
- data/lib/tokenizers/pre_tokenizers/digits.rb +9 -0
- data/lib/tokenizers/pre_tokenizers/metaspace.rb +9 -0
- data/lib/tokenizers/pre_tokenizers/punctuation.rb +9 -0
- data/lib/tokenizers/pre_tokenizers/split.rb +9 -0
- data/lib/tokenizers/processors/byte_level.rb +9 -0
- data/lib/tokenizers/processors/roberta_processing.rb +9 -0
- data/lib/tokenizers/processors/template_processing.rb +9 -0
- data/lib/tokenizers/tokenizer.rb +45 -0
- data/lib/tokenizers/trainers/bpe_trainer.rb +9 -0
- data/lib/tokenizers/trainers/unigram_trainer.rb +26 -0
- data/lib/tokenizers/trainers/word_level_trainer.rb +9 -0
- data/lib/tokenizers/trainers/word_piece_trainer.rb +26 -0
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +49 -7
- metadata +27 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1dd7a6e80756ca265d9152f6fa371e3debcc65ad511a9049d496bfe9265dd2f7
|
4
|
+
data.tar.gz: 36bcd374aa6d797fdb44014cc6c6c080042dff9a90c1b28ccebed0fb8d2d68d1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 54b887046decc5dbd22549309b9087f1c79751269202ac1aa6ea04616388dcbbada02b2b926aa3649b9208747811250aad0f56d5cbd751966697b3e4f9c7447c
|
7
|
+
data.tar.gz: 8d7259138060315e6dd159a8b9b5622e20a366d72e9728e5f0c8a3a986a39b98f388dc779075ef873b18e5648139a1cbbb17e105172fc760967dbe62442ebc10
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
## 0.3.0 (2022-02-07)
|
2
|
+
|
3
|
+
- Added support for training tokenizers
|
4
|
+
- Added more methods to `Tokenizer`
|
5
|
+
- Added `encode_batch` method to `Encoding`
|
6
|
+
- Added `pair` argument to `encode` method
|
7
|
+
- Changed `encode` method to include special tokens by default
|
8
|
+
- Changed how offsets are calculated for strings with multibyte characters
|
9
|
+
|
10
|
+
## 0.2.3 (2022-01-22)
|
11
|
+
|
12
|
+
- Added `add_special_tokens` option to `encode` method
|
13
|
+
- Added warning about `encode` method including special tokens by default in 0.3.0
|
14
|
+
- Added more methods to `Encoding`
|
15
|
+
- Fixed error with precompiled gem on Mac ARM
|
16
|
+
|
1
17
|
## 0.2.2 (2022-01-15)
|
2
18
|
|
3
19
|
- Added precompiled gem for Linux ARM
|
data/Cargo.lock
CHANGED
@@ -50,9 +50,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
|
50
50
|
|
51
51
|
[[package]]
|
52
52
|
name = "cc"
|
53
|
-
version = "1.0.
|
53
|
+
version = "1.0.79"
|
54
54
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
55
|
-
checksum = "
|
55
|
+
checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
|
56
56
|
|
57
57
|
[[package]]
|
58
58
|
name = "cexpr"
|
@@ -138,9 +138,9 @@ dependencies = [
|
|
138
138
|
|
139
139
|
[[package]]
|
140
140
|
name = "darling"
|
141
|
-
version = "0.14.
|
141
|
+
version = "0.14.3"
|
142
142
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
143
|
-
checksum = "
|
143
|
+
checksum = "c0808e1bd8671fb44a113a14e13497557533369847788fa2ae912b6ebfce9fa8"
|
144
144
|
dependencies = [
|
145
145
|
"darling_core",
|
146
146
|
"darling_macro",
|
@@ -148,9 +148,9 @@ dependencies = [
|
|
148
148
|
|
149
149
|
[[package]]
|
150
150
|
name = "darling_core"
|
151
|
-
version = "0.14.
|
151
|
+
version = "0.14.3"
|
152
152
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
153
|
-
checksum = "
|
153
|
+
checksum = "001d80444f28e193f30c2f293455da62dcf9a6b29918a4253152ae2b1de592cb"
|
154
154
|
dependencies = [
|
155
155
|
"fnv",
|
156
156
|
"ident_case",
|
@@ -162,9 +162,9 @@ dependencies = [
|
|
162
162
|
|
163
163
|
[[package]]
|
164
164
|
name = "darling_macro"
|
165
|
-
version = "0.14.
|
165
|
+
version = "0.14.3"
|
166
166
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
167
|
-
checksum = "
|
167
|
+
checksum = "b36230598a2d5de7ec1c6f51f72d8a99a9208daff41de2084d06e3fd3ea56685"
|
168
168
|
dependencies = [
|
169
169
|
"darling_core",
|
170
170
|
"quote",
|
@@ -202,31 +202,11 @@ dependencies = [
|
|
202
202
|
"syn",
|
203
203
|
]
|
204
204
|
|
205
|
-
[[package]]
|
206
|
-
name = "dirs"
|
207
|
-
version = "3.0.2"
|
208
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
209
|
-
checksum = "30baa043103c9d0c2a57cf537cc2f35623889dc0d405e6c3cccfadbc81c71309"
|
210
|
-
dependencies = [
|
211
|
-
"dirs-sys",
|
212
|
-
]
|
213
|
-
|
214
|
-
[[package]]
|
215
|
-
name = "dirs-sys"
|
216
|
-
version = "0.3.7"
|
217
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
218
|
-
checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6"
|
219
|
-
dependencies = [
|
220
|
-
"libc",
|
221
|
-
"redox_users",
|
222
|
-
"winapi",
|
223
|
-
]
|
224
|
-
|
225
205
|
[[package]]
|
226
206
|
name = "either"
|
227
|
-
version = "1.8.
|
207
|
+
version = "1.8.1"
|
228
208
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
229
|
-
checksum = "
|
209
|
+
checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91"
|
230
210
|
|
231
211
|
[[package]]
|
232
212
|
name = "encode_unicode"
|
@@ -372,9 +352,8 @@ checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d"
|
|
372
352
|
|
373
353
|
[[package]]
|
374
354
|
name = "magnus"
|
375
|
-
version = "0.
|
376
|
-
source = "
|
377
|
-
checksum = "fc87660cd7daa49fddbfd524c836de54d5c927d520cd163f43700c5087c57d6c"
|
355
|
+
version = "0.5.0"
|
356
|
+
source = "git+https://github.com/matsadler/magnus#eda735faa7e03da2443eaf2c4058a184917d6b87"
|
378
357
|
dependencies = [
|
379
358
|
"magnus-macros",
|
380
359
|
"rb-sys",
|
@@ -384,8 +363,7 @@ dependencies = [
|
|
384
363
|
[[package]]
|
385
364
|
name = "magnus-macros"
|
386
365
|
version = "0.3.0"
|
387
|
-
source = "
|
388
|
-
checksum = "206cb23bfeea05180c97522ef6a3e52a4eb17b0ed2f30ee3ca9c4f994d2378ae"
|
366
|
+
source = "git+https://github.com/matsadler/magnus#eda735faa7e03da2443eaf2c4058a184917d6b87"
|
389
367
|
dependencies = [
|
390
368
|
"proc-macro2",
|
391
369
|
"quote",
|
@@ -415,9 +393,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
|
415
393
|
|
416
394
|
[[package]]
|
417
395
|
name = "nom"
|
418
|
-
version = "7.1.
|
396
|
+
version = "7.1.3"
|
419
397
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
420
|
-
checksum = "
|
398
|
+
checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
|
421
399
|
dependencies = [
|
422
400
|
"memchr",
|
423
401
|
"minimal-lexical",
|
@@ -493,9 +471,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
|
|
493
471
|
|
494
472
|
[[package]]
|
495
473
|
name = "proc-macro2"
|
496
|
-
version = "1.0.
|
474
|
+
version = "1.0.51"
|
497
475
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
498
|
-
checksum = "
|
476
|
+
checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6"
|
499
477
|
dependencies = [
|
500
478
|
"unicode-ident",
|
501
479
|
]
|
@@ -562,9 +540,9 @@ dependencies = [
|
|
562
540
|
|
563
541
|
[[package]]
|
564
542
|
name = "rayon-core"
|
565
|
-
version = "1.10.
|
543
|
+
version = "1.10.2"
|
566
544
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
567
|
-
checksum = "
|
545
|
+
checksum = "356a0625f1954f730c0201cdab48611198dc6ce21f4acff55089b5a78e6e835b"
|
568
546
|
dependencies = [
|
569
547
|
"crossbeam-channel",
|
570
548
|
"crossbeam-deque",
|
@@ -574,18 +552,18 @@ dependencies = [
|
|
574
552
|
|
575
553
|
[[package]]
|
576
554
|
name = "rb-sys"
|
577
|
-
version = "0.9.
|
555
|
+
version = "0.9.64"
|
578
556
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
579
|
-
checksum = "
|
557
|
+
checksum = "cc8945662df8083245deda89e236647173cc7ad750f481ddcd7bbfd3afe3fa5e"
|
580
558
|
dependencies = [
|
581
559
|
"rb-sys-build",
|
582
560
|
]
|
583
561
|
|
584
562
|
[[package]]
|
585
563
|
name = "rb-sys-build"
|
586
|
-
version = "0.9.
|
564
|
+
version = "0.9.64"
|
587
565
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
588
|
-
checksum = "
|
566
|
+
checksum = "ae8c3cdf9edc3908ee1555b7a1bca58ee1b499439b32cd1c1ec3e66736a8df48"
|
589
567
|
dependencies = [
|
590
568
|
"bindgen",
|
591
569
|
"regex",
|
@@ -594,29 +572,9 @@ dependencies = [
|
|
594
572
|
|
595
573
|
[[package]]
|
596
574
|
name = "rb-sys-env"
|
597
|
-
version = "0.1.
|
598
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
599
|
-
checksum = "74c38752410925faeb82c400c06ba2fd9ee6aa8f719dd33994c9e53f5242d25f"
|
600
|
-
|
601
|
-
[[package]]
|
602
|
-
name = "redox_syscall"
|
603
|
-
version = "0.2.16"
|
575
|
+
version = "0.1.2"
|
604
576
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
605
|
-
checksum = "
|
606
|
-
dependencies = [
|
607
|
-
"bitflags",
|
608
|
-
]
|
609
|
-
|
610
|
-
[[package]]
|
611
|
-
name = "redox_users"
|
612
|
-
version = "0.4.3"
|
613
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
614
|
-
checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b"
|
615
|
-
dependencies = [
|
616
|
-
"getrandom",
|
617
|
-
"redox_syscall",
|
618
|
-
"thiserror",
|
619
|
-
]
|
577
|
+
checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
|
620
578
|
|
621
579
|
[[package]]
|
622
580
|
name = "regex"
|
@@ -675,9 +633,9 @@ dependencies = [
|
|
675
633
|
|
676
634
|
[[package]]
|
677
635
|
name = "serde_json"
|
678
|
-
version = "1.0.
|
636
|
+
version = "1.0.92"
|
679
637
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
680
|
-
checksum = "
|
638
|
+
checksum = "7434af0dc1cbd59268aa98b4c22c131c0584d2232f6fb166efb993e2832e896a"
|
681
639
|
dependencies = [
|
682
640
|
"itoa",
|
683
641
|
"ryu",
|
@@ -753,20 +711,21 @@ dependencies = [
|
|
753
711
|
|
754
712
|
[[package]]
|
755
713
|
name = "tokenizers"
|
756
|
-
version = "0.2.
|
714
|
+
version = "0.2.3"
|
757
715
|
dependencies = [
|
758
716
|
"magnus",
|
717
|
+
"onig",
|
718
|
+
"serde",
|
759
719
|
"tokenizers 0.13.2",
|
760
720
|
]
|
761
721
|
|
762
722
|
[[package]]
|
763
723
|
name = "tokenizers"
|
764
724
|
version = "0.13.2"
|
765
|
-
source = "git+https://github.com/huggingface/tokenizers#
|
725
|
+
source = "git+https://github.com/huggingface/tokenizers#fa66caf0abff16bae2213658ffa3e969c5445750"
|
766
726
|
dependencies = [
|
767
727
|
"aho-corasick",
|
768
728
|
"derive_builder",
|
769
|
-
"dirs",
|
770
729
|
"esaxx-rs",
|
771
730
|
"getrandom",
|
772
731
|
"indicatif",
|
@@ -807,9 +766,9 @@ dependencies = [
|
|
807
766
|
|
808
767
|
[[package]]
|
809
768
|
name = "unicode-segmentation"
|
810
|
-
version = "1.10.
|
769
|
+
version = "1.10.1"
|
811
770
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
812
|
-
checksum = "
|
771
|
+
checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
|
813
772
|
|
814
773
|
[[package]]
|
815
774
|
name = "unicode-width"
|