tokenizers 0.3.2-x86_64-linux-musl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +56 -0
  3. data/Cargo.lock +873 -0
  4. data/Cargo.toml +5 -0
  5. data/LICENSE-THIRD-PARTY.txt +17286 -0
  6. data/LICENSE.txt +202 -0
  7. data/README.md +69 -0
  8. data/lib/tokenizers/2.7/tokenizers.so +0 -0
  9. data/lib/tokenizers/3.0/tokenizers.so +0 -0
  10. data/lib/tokenizers/3.1/tokenizers.so +0 -0
  11. data/lib/tokenizers/3.2/tokenizers.so +0 -0
  12. data/lib/tokenizers/char_bpe_tokenizer.rb +22 -0
  13. data/lib/tokenizers/decoders/bpe_decoder.rb +9 -0
  14. data/lib/tokenizers/decoders/ctc.rb +9 -0
  15. data/lib/tokenizers/decoders/metaspace.rb +9 -0
  16. data/lib/tokenizers/decoders/word_piece.rb +9 -0
  17. data/lib/tokenizers/encoding.rb +19 -0
  18. data/lib/tokenizers/from_pretrained.rb +119 -0
  19. data/lib/tokenizers/models/bpe.rb +9 -0
  20. data/lib/tokenizers/models/unigram.rb +9 -0
  21. data/lib/tokenizers/models/word_level.rb +13 -0
  22. data/lib/tokenizers/models/word_piece.rb +9 -0
  23. data/lib/tokenizers/normalizers/bert_normalizer.rb +9 -0
  24. data/lib/tokenizers/normalizers/strip.rb +9 -0
  25. data/lib/tokenizers/pre_tokenizers/byte_level.rb +9 -0
  26. data/lib/tokenizers/pre_tokenizers/digits.rb +9 -0
  27. data/lib/tokenizers/pre_tokenizers/metaspace.rb +9 -0
  28. data/lib/tokenizers/pre_tokenizers/punctuation.rb +9 -0
  29. data/lib/tokenizers/pre_tokenizers/split.rb +9 -0
  30. data/lib/tokenizers/processors/byte_level.rb +9 -0
  31. data/lib/tokenizers/processors/roberta_processing.rb +9 -0
  32. data/lib/tokenizers/processors/template_processing.rb +9 -0
  33. data/lib/tokenizers/tokenizer.rb +45 -0
  34. data/lib/tokenizers/trainers/bpe_trainer.rb +9 -0
  35. data/lib/tokenizers/trainers/unigram_trainer.rb +26 -0
  36. data/lib/tokenizers/trainers/word_level_trainer.rb +9 -0
  37. data/lib/tokenizers/trainers/word_piece_trainer.rb +26 -0
  38. data/lib/tokenizers/version.rb +3 -0
  39. data/lib/tokenizers.rb +59 -0
  40. metadata +83 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 05c593445e9f2ff1de6bd1cb33c7030322257f17d648adde49cfe32ea3e14c63
4
+ data.tar.gz: 5db423819590e36b9980956615cc99b20458eabf562bf416af0fdaee6bd34831
5
+ SHA512:
6
+ metadata.gz: '090b5592fcd6270b281ab927bdc97160bdd85aea1700a1067c453e1ee2a685baf8fe907c28ae8ea22cbedc6c2f2d35b6a206c25d67aea3aa7f4233168d0ebd05'
7
+ data.tar.gz: 67236754456f1da6a28fe515f5f18fefefdfc6f351a31d57d3a40dbb9bb1929e33224df8550baa0c0b393e4fe1ce266610e5e4b0edaad416625395b407ea1e1d
data/CHANGELOG.md ADDED
@@ -0,0 +1,56 @@
1
+ ## 0.3.2 (2023-03-06)
2
+
3
+ - Added precompiled gem for Linux x86-64 MUSL
4
+
5
+ ## 0.3.1 (2023-02-08)
6
+
7
+ - Fixed error with Ruby 2.7
8
+
9
+ ## 0.3.0 (2023-02-07)
10
+
11
+ - Added support for training tokenizers
12
+ - Added more methods to `Tokenizer`
13
+ - Added `encode_batch` method to `Encoding`
14
+ - Added `pair` argument to `encode` method
15
+ - Changed `encode` method to include special tokens by default
16
+ - Changed how offsets are calculated for strings with multibyte characters
17
+
18
+ ## 0.2.3 (2023-01-22)
19
+
20
+ - Added `add_special_tokens` option to `encode` method
21
+ - Added warning about `encode` method including special tokens by default in 0.3.0
22
+ - Added more methods to `Encoding`
23
+ - Fixed error with precompiled gem on Mac ARM
24
+
25
+ ## 0.2.2 (2023-01-15)
26
+
27
+ - Added precompiled gem for Linux ARM
28
+ - Added `from_file` method
29
+ - Fixed error with precompiled gem on Linux x86-64
30
+
31
+ ## 0.2.1 (2023-01-12)
32
+
33
+ - Added support for Ruby 3.2
34
+
35
+ ## 0.2.0 (2022-12-11)
36
+
37
+ - Added precompiled gems for Linux x86-64 and Mac
38
+ - Switched to `rb_sys` gem for building extension
39
+ - Updated Tokenizers to 0.13.2
40
+ - Updated Rust edition to 2021
41
+
42
+ ## 0.1.3 (2022-10-06)
43
+
44
+ - Updated Tokenizers to 0.13.1
45
+
46
+ ## 0.1.2 (2022-09-08)
47
+
48
+ - Fixed error with installation on Linux
49
+
50
+ ## 0.1.1 (2022-06-29)
51
+
52
+ - Fixed error with installation
53
+
54
+ ## 0.1.0 (2022-03-19)
55
+
56
+ - First release