tokenizers 0.2.3-x86_64-linux → 0.3.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +9 -0
  3. data/Cargo.lock +32 -73
  4. data/LICENSE-THIRD-PARTY.txt +214 -858
  5. data/README.md +4 -0
  6. data/lib/tokenizers/2.7/tokenizers.so +0 -0
  7. data/lib/tokenizers/3.0/tokenizers.so +0 -0
  8. data/lib/tokenizers/3.1/tokenizers.so +0 -0
  9. data/lib/tokenizers/3.2/tokenizers.so +0 -0
  10. data/lib/tokenizers/char_bpe_tokenizer.rb +9 -6
  11. data/lib/tokenizers/decoders/bpe_decoder.rb +9 -0
  12. data/lib/tokenizers/decoders/ctc.rb +9 -0
  13. data/lib/tokenizers/decoders/metaspace.rb +9 -0
  14. data/lib/tokenizers/decoders/word_piece.rb +9 -0
  15. data/lib/tokenizers/from_pretrained.rb +1 -1
  16. data/lib/tokenizers/models/bpe.rb +9 -0
  17. data/lib/tokenizers/models/unigram.rb +9 -0
  18. data/lib/tokenizers/models/word_level.rb +13 -0
  19. data/lib/tokenizers/models/word_piece.rb +9 -0
  20. data/lib/tokenizers/normalizers/bert_normalizer.rb +9 -0
  21. data/lib/tokenizers/normalizers/strip.rb +9 -0
  22. data/lib/tokenizers/pre_tokenizers/byte_level.rb +9 -0
  23. data/lib/tokenizers/pre_tokenizers/digits.rb +9 -0
  24. data/lib/tokenizers/pre_tokenizers/metaspace.rb +9 -0
  25. data/lib/tokenizers/pre_tokenizers/punctuation.rb +9 -0
  26. data/lib/tokenizers/pre_tokenizers/split.rb +9 -0
  27. data/lib/tokenizers/processors/byte_level.rb +9 -0
  28. data/lib/tokenizers/processors/roberta_processing.rb +9 -0
  29. data/lib/tokenizers/processors/template_processing.rb +9 -0
  30. data/lib/tokenizers/tokenizer.rb +40 -7
  31. data/lib/tokenizers/trainers/bpe_trainer.rb +9 -0
  32. data/lib/tokenizers/trainers/unigram_trainer.rb +26 -0
  33. data/lib/tokenizers/trainers/word_level_trainer.rb +9 -0
  34. data/lib/tokenizers/trainers/word_piece_trainer.rb +26 -0
  35. data/lib/tokenizers/version.rb +1 -1
  36. data/lib/tokenizers.rb +42 -2
  37. metadata +24 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 51273c1f38d9a2fcbcda6df42b1f8eff718965e84ab49233b6e54bef0825aed4
4
- data.tar.gz: 3ce5e8543e7ac32c6302fcdefde06fdebb249be42db234bbbbe8671bb414a69a
3
+ metadata.gz: 1dd7a6e80756ca265d9152f6fa371e3debcc65ad511a9049d496bfe9265dd2f7
4
+ data.tar.gz: 36bcd374aa6d797fdb44014cc6c6c080042dff9a90c1b28ccebed0fb8d2d68d1
5
5
  SHA512:
6
- metadata.gz: 45946d725ed104ca0001cf323c4bc0146050f051e8f9150aba22f5169640cef2227729ccbd9f705c43ada82ff53703ddaceb52a77259f0096066739c93c13a07
7
- data.tar.gz: b056d1024ab43363c3ba3b0ae1cec2874fab31630de474eb0a6694484b88418bad408155121bf2adc0ed0ed4a70064765565e630d3ba648cb23e6b5c34d9aefc
6
+ metadata.gz: 54b887046decc5dbd22549309b9087f1c79751269202ac1aa6ea04616388dcbbada02b2b926aa3649b9208747811250aad0f56d5cbd751966697b3e4f9c7447c
7
+ data.tar.gz: 8d7259138060315e6dd159a8b9b5622e20a366d72e9728e5f0c8a3a986a39b98f388dc779075ef873b18e5648139a1cbbb17e105172fc760967dbe62442ebc10
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
1
+ ## 0.3.0 (2022-02-07)
2
+
3
+ - Added support for training tokenizers
4
+ - Added more methods to `Tokenizer`
5
+ - Added `encode_batch` method to `Encoding`
6
+ - Added `pair` argument to `encode` method
7
+ - Changed `encode` method to include special tokens by default
8
+ - Changed how offsets are calculated for strings with multibyte characters
9
+
1
10
  ## 0.2.3 (2022-01-22)
2
11
 
3
12
  - Added `add_special_tokens` option to `encode` method
data/Cargo.lock CHANGED
@@ -50,9 +50,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
50
50
 
51
51
  [[package]]
52
52
  name = "cc"
53
- version = "1.0.78"
53
+ version = "1.0.79"
54
54
  source = "registry+https://github.com/rust-lang/crates.io-index"
55
- checksum = "a20104e2335ce8a659d6dd92a51a767a0c062599c73b343fd152cb401e828c3d"
55
+ checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
56
56
 
57
57
  [[package]]
58
58
  name = "cexpr"
@@ -138,9 +138,9 @@ dependencies = [
138
138
 
139
139
  [[package]]
140
140
  name = "darling"
141
- version = "0.14.2"
141
+ version = "0.14.3"
142
142
  source = "registry+https://github.com/rust-lang/crates.io-index"
143
- checksum = "b0dd3cd20dc6b5a876612a6e5accfe7f3dd883db6d07acfbf14c128f61550dfa"
143
+ checksum = "c0808e1bd8671fb44a113a14e13497557533369847788fa2ae912b6ebfce9fa8"
144
144
  dependencies = [
145
145
  "darling_core",
146
146
  "darling_macro",
@@ -148,9 +148,9 @@ dependencies = [
148
148
 
149
149
  [[package]]
150
150
  name = "darling_core"
151
- version = "0.14.2"
151
+ version = "0.14.3"
152
152
  source = "registry+https://github.com/rust-lang/crates.io-index"
153
- checksum = "a784d2ccaf7c98501746bf0be29b2022ba41fd62a2e622af997a03e9f972859f"
153
+ checksum = "001d80444f28e193f30c2f293455da62dcf9a6b29918a4253152ae2b1de592cb"
154
154
  dependencies = [
155
155
  "fnv",
156
156
  "ident_case",
@@ -162,9 +162,9 @@ dependencies = [
162
162
 
163
163
  [[package]]
164
164
  name = "darling_macro"
165
- version = "0.14.2"
165
+ version = "0.14.3"
166
166
  source = "registry+https://github.com/rust-lang/crates.io-index"
167
- checksum = "7618812407e9402654622dd402b0a89dff9ba93badd6540781526117b92aab7e"
167
+ checksum = "b36230598a2d5de7ec1c6f51f72d8a99a9208daff41de2084d06e3fd3ea56685"
168
168
  dependencies = [
169
169
  "darling_core",
170
170
  "quote",
@@ -202,31 +202,11 @@ dependencies = [
202
202
  "syn",
203
203
  ]
204
204
 
205
- [[package]]
206
- name = "dirs"
207
- version = "3.0.2"
208
- source = "registry+https://github.com/rust-lang/crates.io-index"
209
- checksum = "30baa043103c9d0c2a57cf537cc2f35623889dc0d405e6c3cccfadbc81c71309"
210
- dependencies = [
211
- "dirs-sys",
212
- ]
213
-
214
- [[package]]
215
- name = "dirs-sys"
216
- version = "0.3.7"
217
- source = "registry+https://github.com/rust-lang/crates.io-index"
218
- checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6"
219
- dependencies = [
220
- "libc",
221
- "redox_users",
222
- "winapi",
223
- ]
224
-
225
205
  [[package]]
226
206
  name = "either"
227
- version = "1.8.0"
207
+ version = "1.8.1"
228
208
  source = "registry+https://github.com/rust-lang/crates.io-index"
229
- checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797"
209
+ checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91"
230
210
 
231
211
  [[package]]
232
212
  name = "encode_unicode"
@@ -372,9 +352,8 @@ checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d"
372
352
 
373
353
  [[package]]
374
354
  name = "magnus"
375
- version = "0.4.4"
376
- source = "registry+https://github.com/rust-lang/crates.io-index"
377
- checksum = "fc87660cd7daa49fddbfd524c836de54d5c927d520cd163f43700c5087c57d6c"
355
+ version = "0.5.0"
356
+ source = "git+https://github.com/matsadler/magnus#eda735faa7e03da2443eaf2c4058a184917d6b87"
378
357
  dependencies = [
379
358
  "magnus-macros",
380
359
  "rb-sys",
@@ -384,8 +363,7 @@ dependencies = [
384
363
  [[package]]
385
364
  name = "magnus-macros"
386
365
  version = "0.3.0"
387
- source = "registry+https://github.com/rust-lang/crates.io-index"
388
- checksum = "206cb23bfeea05180c97522ef6a3e52a4eb17b0ed2f30ee3ca9c4f994d2378ae"
366
+ source = "git+https://github.com/matsadler/magnus#eda735faa7e03da2443eaf2c4058a184917d6b87"
389
367
  dependencies = [
390
368
  "proc-macro2",
391
369
  "quote",
@@ -415,9 +393,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
415
393
 
416
394
  [[package]]
417
395
  name = "nom"
418
- version = "7.1.2"
396
+ version = "7.1.3"
419
397
  source = "registry+https://github.com/rust-lang/crates.io-index"
420
- checksum = "e5507769c4919c998e69e49c839d9dc6e693ede4cc4290d6ad8b41d4f09c548c"
398
+ checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
421
399
  dependencies = [
422
400
  "memchr",
423
401
  "minimal-lexical",
@@ -493,9 +471,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
493
471
 
494
472
  [[package]]
495
473
  name = "proc-macro2"
496
- version = "1.0.49"
474
+ version = "1.0.51"
497
475
  source = "registry+https://github.com/rust-lang/crates.io-index"
498
- checksum = "57a8eca9f9c4ffde41714334dee777596264c7825420f521abc92b5b5deb63a5"
476
+ checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6"
499
477
  dependencies = [
500
478
  "unicode-ident",
501
479
  ]
@@ -562,9 +540,9 @@ dependencies = [
562
540
 
563
541
  [[package]]
564
542
  name = "rayon-core"
565
- version = "1.10.1"
543
+ version = "1.10.2"
566
544
  source = "registry+https://github.com/rust-lang/crates.io-index"
567
- checksum = "cac410af5d00ab6884528b4ab69d1e8e146e8d471201800fa1b4524126de6ad3"
545
+ checksum = "356a0625f1954f730c0201cdab48611198dc6ce21f4acff55089b5a78e6e835b"
568
546
  dependencies = [
569
547
  "crossbeam-channel",
570
548
  "crossbeam-deque",
@@ -574,18 +552,18 @@ dependencies = [
574
552
 
575
553
  [[package]]
576
554
  name = "rb-sys"
577
- version = "0.9.56"
555
+ version = "0.9.64"
578
556
  source = "registry+https://github.com/rust-lang/crates.io-index"
579
- checksum = "ef82428221475c6f9e7893fe30b88d45ac86bdb12e58e7c92055ba4bceb78a69"
557
+ checksum = "cc8945662df8083245deda89e236647173cc7ad750f481ddcd7bbfd3afe3fa5e"
580
558
  dependencies = [
581
559
  "rb-sys-build",
582
560
  ]
583
561
 
584
562
  [[package]]
585
563
  name = "rb-sys-build"
586
- version = "0.9.56"
564
+ version = "0.9.64"
587
565
  source = "registry+https://github.com/rust-lang/crates.io-index"
588
- checksum = "950bfc239d2e7704576abe4d37b008876bbfd70a99196a188c5caeae2ba7344a"
566
+ checksum = "ae8c3cdf9edc3908ee1555b7a1bca58ee1b499439b32cd1c1ec3e66736a8df48"
589
567
  dependencies = [
590
568
  "bindgen",
591
569
  "regex",
@@ -594,29 +572,9 @@ dependencies = [
594
572
 
595
573
  [[package]]
596
574
  name = "rb-sys-env"
597
- version = "0.1.1"
598
- source = "registry+https://github.com/rust-lang/crates.io-index"
599
- checksum = "74c38752410925faeb82c400c06ba2fd9ee6aa8f719dd33994c9e53f5242d25f"
600
-
601
- [[package]]
602
- name = "redox_syscall"
603
- version = "0.2.16"
575
+ version = "0.1.2"
604
576
  source = "registry+https://github.com/rust-lang/crates.io-index"
605
- checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
606
- dependencies = [
607
- "bitflags",
608
- ]
609
-
610
- [[package]]
611
- name = "redox_users"
612
- version = "0.4.3"
613
- source = "registry+https://github.com/rust-lang/crates.io-index"
614
- checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b"
615
- dependencies = [
616
- "getrandom",
617
- "redox_syscall",
618
- "thiserror",
619
- ]
577
+ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
620
578
 
621
579
  [[package]]
622
580
  name = "regex"
@@ -675,9 +633,9 @@ dependencies = [
675
633
 
676
634
  [[package]]
677
635
  name = "serde_json"
678
- version = "1.0.91"
636
+ version = "1.0.92"
679
637
  source = "registry+https://github.com/rust-lang/crates.io-index"
680
- checksum = "877c235533714907a8c2464236f5c4b2a17262ef1bd71f38f35ea592c8da6883"
638
+ checksum = "7434af0dc1cbd59268aa98b4c22c131c0584d2232f6fb166efb993e2832e896a"
681
639
  dependencies = [
682
640
  "itoa",
683
641
  "ryu",
@@ -756,17 +714,18 @@ name = "tokenizers"
756
714
  version = "0.2.3"
757
715
  dependencies = [
758
716
  "magnus",
717
+ "onig",
718
+ "serde",
759
719
  "tokenizers 0.13.2",
760
720
  ]
761
721
 
762
722
  [[package]]
763
723
  name = "tokenizers"
764
724
  version = "0.13.2"
765
- source = "git+https://github.com/huggingface/tokenizers#fe4ae7dc38be11a5c93ae703816c869f993c21ab"
725
+ source = "git+https://github.com/huggingface/tokenizers#fa66caf0abff16bae2213658ffa3e969c5445750"
766
726
  dependencies = [
767
727
  "aho-corasick",
768
728
  "derive_builder",
769
- "dirs",
770
729
  "esaxx-rs",
771
730
  "getrandom",
772
731
  "indicatif",
@@ -807,9 +766,9 @@ dependencies = [
807
766
 
808
767
  [[package]]
809
768
  name = "unicode-segmentation"
810
- version = "1.10.0"
769
+ version = "1.10.1"
811
770
  source = "registry+https://github.com/rust-lang/crates.io-index"
812
- checksum = "0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a"
771
+ checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
813
772
 
814
773
  [[package]]
815
774
  name = "unicode-width"