tokenizers 0.2.3-aarch64-linux → 0.3.0-aarch64-linux

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +9 -0
  3. data/Cargo.lock +32 -73
  4. data/LICENSE-THIRD-PARTY.txt +214 -858
  5. data/README.md +4 -0
  6. data/lib/tokenizers/2.7/tokenizers.so +0 -0
  7. data/lib/tokenizers/3.0/tokenizers.so +0 -0
  8. data/lib/tokenizers/3.1/tokenizers.so +0 -0
  9. data/lib/tokenizers/3.2/tokenizers.so +0 -0
  10. data/lib/tokenizers/char_bpe_tokenizer.rb +9 -6
  11. data/lib/tokenizers/decoders/bpe_decoder.rb +9 -0
  12. data/lib/tokenizers/decoders/ctc.rb +9 -0
  13. data/lib/tokenizers/decoders/metaspace.rb +9 -0
  14. data/lib/tokenizers/decoders/word_piece.rb +9 -0
  15. data/lib/tokenizers/from_pretrained.rb +1 -1
  16. data/lib/tokenizers/models/bpe.rb +9 -0
  17. data/lib/tokenizers/models/unigram.rb +9 -0
  18. data/lib/tokenizers/models/word_level.rb +13 -0
  19. data/lib/tokenizers/models/word_piece.rb +9 -0
  20. data/lib/tokenizers/normalizers/bert_normalizer.rb +9 -0
  21. data/lib/tokenizers/normalizers/strip.rb +9 -0
  22. data/lib/tokenizers/pre_tokenizers/byte_level.rb +9 -0
  23. data/lib/tokenizers/pre_tokenizers/digits.rb +9 -0
  24. data/lib/tokenizers/pre_tokenizers/metaspace.rb +9 -0
  25. data/lib/tokenizers/pre_tokenizers/punctuation.rb +9 -0
  26. data/lib/tokenizers/pre_tokenizers/split.rb +9 -0
  27. data/lib/tokenizers/processors/byte_level.rb +9 -0
  28. data/lib/tokenizers/processors/roberta_processing.rb +9 -0
  29. data/lib/tokenizers/processors/template_processing.rb +9 -0
  30. data/lib/tokenizers/tokenizer.rb +40 -7
  31. data/lib/tokenizers/trainers/bpe_trainer.rb +9 -0
  32. data/lib/tokenizers/trainers/unigram_trainer.rb +26 -0
  33. data/lib/tokenizers/trainers/word_level_trainer.rb +9 -0
  34. data/lib/tokenizers/trainers/word_piece_trainer.rb +26 -0
  35. data/lib/tokenizers/version.rb +1 -1
  36. data/lib/tokenizers.rb +42 -2
  37. metadata +24 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8ebac1b83e37b024ec99ac2e96b1085bdbdd060207b53a0d5411f826c15e7a05
4
- data.tar.gz: 1278eb2b6c661f441f65eb53a28595690c51b532a5e468b606ad75fa57aaecba
3
+ metadata.gz: 373d0c9f060b073944f6fc29ccde5d76db8acb7c8dbcec33c6ca5e19c6e1d1b6
4
+ data.tar.gz: 139ad4dea993c1f11d67c7a6c1accb80adb4fae19f3d1caabf53743ea7504c11
5
5
  SHA512:
6
- metadata.gz: 6ddca76a32fdf761150981de37a876b4f3b20283f77fd3b1adb0dbfb3be52af7d95d4ed6d883c7ed98cf4fd6ce32ea50f2396781400ab7071d84b340dc8ccbd2
7
- data.tar.gz: 0a53cb1b001870aae8fbfdfcf6e93fc83ec19543327338764c2d8bd4af4d2ec8620dfe1b8f5e1f83bf0a45fe123e7973664c86650622e99ecd01839b0da6a450
6
+ metadata.gz: 10244c830e5638aee1b825dab7b836a4a1fbdbd8ac3fa3c54b5714686299abb42c48da830345de08f523adc8ed2350f09c35c8e570744cf10999655309ea6083
7
+ data.tar.gz: f0aa5dcc193e7181878baaad63f7e6f385bc555def23678bdc86abd28f403a3d65d48035c51671876f8ba3265bc804f881c131200f411f23250f98da2ea0d2c3
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
1
+ ## 0.3.0 (2022-02-07)
2
+
3
+ - Added support for training tokenizers
4
+ - Added more methods to `Tokenizer`
5
+ - Added `encode_batch` method to `Encoding`
6
+ - Added `pair` argument to `encode` method
7
+ - Changed `encode` method to include special tokens by default
8
+ - Changed how offsets are calculated for strings with multibyte characters
9
+
1
10
  ## 0.2.3 (2022-01-22)
2
11
 
3
12
  - Added `add_special_tokens` option to `encode` method
data/Cargo.lock CHANGED
@@ -50,9 +50,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
50
50
 
51
51
  [[package]]
52
52
  name = "cc"
53
- version = "1.0.78"
53
+ version = "1.0.79"
54
54
  source = "registry+https://github.com/rust-lang/crates.io-index"
55
- checksum = "a20104e2335ce8a659d6dd92a51a767a0c062599c73b343fd152cb401e828c3d"
55
+ checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
56
56
 
57
57
  [[package]]
58
58
  name = "cexpr"
@@ -138,9 +138,9 @@ dependencies = [
138
138
 
139
139
  [[package]]
140
140
  name = "darling"
141
- version = "0.14.2"
141
+ version = "0.14.3"
142
142
  source = "registry+https://github.com/rust-lang/crates.io-index"
143
- checksum = "b0dd3cd20dc6b5a876612a6e5accfe7f3dd883db6d07acfbf14c128f61550dfa"
143
+ checksum = "c0808e1bd8671fb44a113a14e13497557533369847788fa2ae912b6ebfce9fa8"
144
144
  dependencies = [
145
145
  "darling_core",
146
146
  "darling_macro",
@@ -148,9 +148,9 @@ dependencies = [
148
148
 
149
149
  [[package]]
150
150
  name = "darling_core"
151
- version = "0.14.2"
151
+ version = "0.14.3"
152
152
  source = "registry+https://github.com/rust-lang/crates.io-index"
153
- checksum = "a784d2ccaf7c98501746bf0be29b2022ba41fd62a2e622af997a03e9f972859f"
153
+ checksum = "001d80444f28e193f30c2f293455da62dcf9a6b29918a4253152ae2b1de592cb"
154
154
  dependencies = [
155
155
  "fnv",
156
156
  "ident_case",
@@ -162,9 +162,9 @@ dependencies = [
162
162
 
163
163
  [[package]]
164
164
  name = "darling_macro"
165
- version = "0.14.2"
165
+ version = "0.14.3"
166
166
  source = "registry+https://github.com/rust-lang/crates.io-index"
167
- checksum = "7618812407e9402654622dd402b0a89dff9ba93badd6540781526117b92aab7e"
167
+ checksum = "b36230598a2d5de7ec1c6f51f72d8a99a9208daff41de2084d06e3fd3ea56685"
168
168
  dependencies = [
169
169
  "darling_core",
170
170
  "quote",
@@ -202,31 +202,11 @@ dependencies = [
202
202
  "syn",
203
203
  ]
204
204
 
205
- [[package]]
206
- name = "dirs"
207
- version = "3.0.2"
208
- source = "registry+https://github.com/rust-lang/crates.io-index"
209
- checksum = "30baa043103c9d0c2a57cf537cc2f35623889dc0d405e6c3cccfadbc81c71309"
210
- dependencies = [
211
- "dirs-sys",
212
- ]
213
-
214
- [[package]]
215
- name = "dirs-sys"
216
- version = "0.3.7"
217
- source = "registry+https://github.com/rust-lang/crates.io-index"
218
- checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6"
219
- dependencies = [
220
- "libc",
221
- "redox_users",
222
- "winapi",
223
- ]
224
-
225
205
  [[package]]
226
206
  name = "either"
227
- version = "1.8.0"
207
+ version = "1.8.1"
228
208
  source = "registry+https://github.com/rust-lang/crates.io-index"
229
- checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797"
209
+ checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91"
230
210
 
231
211
  [[package]]
232
212
  name = "encode_unicode"
@@ -372,9 +352,8 @@ checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d"
372
352
 
373
353
  [[package]]
374
354
  name = "magnus"
375
- version = "0.4.4"
376
- source = "registry+https://github.com/rust-lang/crates.io-index"
377
- checksum = "fc87660cd7daa49fddbfd524c836de54d5c927d520cd163f43700c5087c57d6c"
355
+ version = "0.5.0"
356
+ source = "git+https://github.com/matsadler/magnus#eda735faa7e03da2443eaf2c4058a184917d6b87"
378
357
  dependencies = [
379
358
  "magnus-macros",
380
359
  "rb-sys",
@@ -384,8 +363,7 @@ dependencies = [
384
363
  [[package]]
385
364
  name = "magnus-macros"
386
365
  version = "0.3.0"
387
- source = "registry+https://github.com/rust-lang/crates.io-index"
388
- checksum = "206cb23bfeea05180c97522ef6a3e52a4eb17b0ed2f30ee3ca9c4f994d2378ae"
366
+ source = "git+https://github.com/matsadler/magnus#eda735faa7e03da2443eaf2c4058a184917d6b87"
389
367
  dependencies = [
390
368
  "proc-macro2",
391
369
  "quote",
@@ -415,9 +393,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
415
393
 
416
394
  [[package]]
417
395
  name = "nom"
418
- version = "7.1.2"
396
+ version = "7.1.3"
419
397
  source = "registry+https://github.com/rust-lang/crates.io-index"
420
- checksum = "e5507769c4919c998e69e49c839d9dc6e693ede4cc4290d6ad8b41d4f09c548c"
398
+ checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
421
399
  dependencies = [
422
400
  "memchr",
423
401
  "minimal-lexical",
@@ -493,9 +471,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
493
471
 
494
472
  [[package]]
495
473
  name = "proc-macro2"
496
- version = "1.0.49"
474
+ version = "1.0.51"
497
475
  source = "registry+https://github.com/rust-lang/crates.io-index"
498
- checksum = "57a8eca9f9c4ffde41714334dee777596264c7825420f521abc92b5b5deb63a5"
476
+ checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6"
499
477
  dependencies = [
500
478
  "unicode-ident",
501
479
  ]
@@ -562,9 +540,9 @@ dependencies = [
562
540
 
563
541
  [[package]]
564
542
  name = "rayon-core"
565
- version = "1.10.1"
543
+ version = "1.10.2"
566
544
  source = "registry+https://github.com/rust-lang/crates.io-index"
567
- checksum = "cac410af5d00ab6884528b4ab69d1e8e146e8d471201800fa1b4524126de6ad3"
545
+ checksum = "356a0625f1954f730c0201cdab48611198dc6ce21f4acff55089b5a78e6e835b"
568
546
  dependencies = [
569
547
  "crossbeam-channel",
570
548
  "crossbeam-deque",
@@ -574,18 +552,18 @@ dependencies = [
574
552
 
575
553
  [[package]]
576
554
  name = "rb-sys"
577
- version = "0.9.56"
555
+ version = "0.9.64"
578
556
  source = "registry+https://github.com/rust-lang/crates.io-index"
579
- checksum = "ef82428221475c6f9e7893fe30b88d45ac86bdb12e58e7c92055ba4bceb78a69"
557
+ checksum = "cc8945662df8083245deda89e236647173cc7ad750f481ddcd7bbfd3afe3fa5e"
580
558
  dependencies = [
581
559
  "rb-sys-build",
582
560
  ]
583
561
 
584
562
  [[package]]
585
563
  name = "rb-sys-build"
586
- version = "0.9.56"
564
+ version = "0.9.64"
587
565
  source = "registry+https://github.com/rust-lang/crates.io-index"
588
- checksum = "950bfc239d2e7704576abe4d37b008876bbfd70a99196a188c5caeae2ba7344a"
566
+ checksum = "ae8c3cdf9edc3908ee1555b7a1bca58ee1b499439b32cd1c1ec3e66736a8df48"
589
567
  dependencies = [
590
568
  "bindgen",
591
569
  "regex",
@@ -594,29 +572,9 @@ dependencies = [
594
572
 
595
573
  [[package]]
596
574
  name = "rb-sys-env"
597
- version = "0.1.1"
598
- source = "registry+https://github.com/rust-lang/crates.io-index"
599
- checksum = "74c38752410925faeb82c400c06ba2fd9ee6aa8f719dd33994c9e53f5242d25f"
600
-
601
- [[package]]
602
- name = "redox_syscall"
603
- version = "0.2.16"
575
+ version = "0.1.2"
604
576
  source = "registry+https://github.com/rust-lang/crates.io-index"
605
- checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
606
- dependencies = [
607
- "bitflags",
608
- ]
609
-
610
- [[package]]
611
- name = "redox_users"
612
- version = "0.4.3"
613
- source = "registry+https://github.com/rust-lang/crates.io-index"
614
- checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b"
615
- dependencies = [
616
- "getrandom",
617
- "redox_syscall",
618
- "thiserror",
619
- ]
577
+ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
620
578
 
621
579
  [[package]]
622
580
  name = "regex"
@@ -675,9 +633,9 @@ dependencies = [
675
633
 
676
634
  [[package]]
677
635
  name = "serde_json"
678
- version = "1.0.91"
636
+ version = "1.0.92"
679
637
  source = "registry+https://github.com/rust-lang/crates.io-index"
680
- checksum = "877c235533714907a8c2464236f5c4b2a17262ef1bd71f38f35ea592c8da6883"
638
+ checksum = "7434af0dc1cbd59268aa98b4c22c131c0584d2232f6fb166efb993e2832e896a"
681
639
  dependencies = [
682
640
  "itoa",
683
641
  "ryu",
@@ -756,17 +714,18 @@ name = "tokenizers"
756
714
  version = "0.2.3"
757
715
  dependencies = [
758
716
  "magnus",
717
+ "onig",
718
+ "serde",
759
719
  "tokenizers 0.13.2",
760
720
  ]
761
721
 
762
722
  [[package]]
763
723
  name = "tokenizers"
764
724
  version = "0.13.2"
765
- source = "git+https://github.com/huggingface/tokenizers#fe4ae7dc38be11a5c93ae703816c869f993c21ab"
725
+ source = "git+https://github.com/huggingface/tokenizers#fa66caf0abff16bae2213658ffa3e969c5445750"
766
726
  dependencies = [
767
727
  "aho-corasick",
768
728
  "derive_builder",
769
- "dirs",
770
729
  "esaxx-rs",
771
730
  "getrandom",
772
731
  "indicatif",
@@ -807,9 +766,9 @@ dependencies = [
807
766
 
808
767
  [[package]]
809
768
  name = "unicode-segmentation"
810
- version = "1.10.0"
769
+ version = "1.10.1"
811
770
  source = "registry+https://github.com/rust-lang/crates.io-index"
812
- checksum = "0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a"
771
+ checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
813
772
 
814
773
  [[package]]
815
774
  name = "unicode-width"