tokenizers 0.5.1-aarch64-linux-musl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +95 -0
  3. data/Cargo.lock +895 -0
  4. data/Cargo.toml +6 -0
  5. data/LICENSE-THIRD-PARTY.txt +17104 -0
  6. data/LICENSE.txt +202 -0
  7. data/README.md +105 -0
  8. data/lib/tokenizers/3.1/tokenizers.so +0 -0
  9. data/lib/tokenizers/3.2/tokenizers.so +0 -0
  10. data/lib/tokenizers/3.3/tokenizers.so +0 -0
  11. data/lib/tokenizers/char_bpe_tokenizer.rb +22 -0
  12. data/lib/tokenizers/decoders/bpe_decoder.rb +9 -0
  13. data/lib/tokenizers/decoders/ctc.rb +9 -0
  14. data/lib/tokenizers/decoders/metaspace.rb +9 -0
  15. data/lib/tokenizers/decoders/strip.rb +9 -0
  16. data/lib/tokenizers/decoders/word_piece.rb +9 -0
  17. data/lib/tokenizers/encoding.rb +19 -0
  18. data/lib/tokenizers/from_pretrained.rb +125 -0
  19. data/lib/tokenizers/models/bpe.rb +9 -0
  20. data/lib/tokenizers/models/unigram.rb +9 -0
  21. data/lib/tokenizers/models/word_level.rb +13 -0
  22. data/lib/tokenizers/models/word_piece.rb +9 -0
  23. data/lib/tokenizers/normalizers/bert_normalizer.rb +9 -0
  24. data/lib/tokenizers/normalizers/prepend.rb +9 -0
  25. data/lib/tokenizers/normalizers/strip.rb +9 -0
  26. data/lib/tokenizers/pre_tokenizers/byte_level.rb +9 -0
  27. data/lib/tokenizers/pre_tokenizers/digits.rb +9 -0
  28. data/lib/tokenizers/pre_tokenizers/metaspace.rb +9 -0
  29. data/lib/tokenizers/pre_tokenizers/punctuation.rb +9 -0
  30. data/lib/tokenizers/pre_tokenizers/split.rb +9 -0
  31. data/lib/tokenizers/processors/byte_level.rb +9 -0
  32. data/lib/tokenizers/processors/roberta_processing.rb +9 -0
  33. data/lib/tokenizers/processors/template_processing.rb +9 -0
  34. data/lib/tokenizers/tokenizer.rb +45 -0
  35. data/lib/tokenizers/trainers/bpe_trainer.rb +9 -0
  36. data/lib/tokenizers/trainers/unigram_trainer.rb +26 -0
  37. data/lib/tokenizers/trainers/word_level_trainer.rb +9 -0
  38. data/lib/tokenizers/trainers/word_piece_trainer.rb +26 -0
  39. data/lib/tokenizers/version.rb +3 -0
  40. data/lib/tokenizers.rb +61 -0
  41. metadata +84 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: b8c4ff155c60f5ed4f81ec5254c88179ed786149d2fe0698326c7a0c6be86cb2
4
+ data.tar.gz: 9f2ec69eeb9ec75e47d54213421b7dca8b7b26c21daad4f42a4475dfda5b1588
5
+ SHA512:
6
+ metadata.gz: 92f5b14b17ef2d41d45dfcba81202103a63ce3f6e2fd5d575519c1faa845de7b31ae28971bac36108bd0d2e0d4cc0a30b694fd48cfb093ca47604041e03e8719
7
+ data.tar.gz: 64eef8d1a8e7d7990085b1f8558e9abb17e501bc70dc35509a2b42034a4ddf6011eb9967f5288b994a6193e3e4e4f690292296e6c90d9825b0a924d7e41c39c3
data/CHANGELOG.md ADDED
@@ -0,0 +1,95 @@
1
+ ## 0.5.1 (2024-08-13)
2
+
3
+ - Updated Tokenizers to 0.20.0
4
+ - Added precompiled gem for Linux ARM MUSL
5
+
6
+ ## 0.5.0 (2024-05-21)
7
+
8
+ - Updated Tokenizers to 0.19.1
9
+ - Replaced `add_prefix_space` with `prepend_scheme` and `split` options for `Metaspace` decoder and pre-tokenizer
10
+ - Dropped support for Ruby < 3.1
11
+
12
+ ## 0.4.4 (2024-02-27)
13
+
14
+ - Updated Tokenizers to 0.15.2
15
+
16
+ ## 0.4.3 (2024-01-03)
17
+
18
+ - Added support for Ruby 3.3
19
+
20
+ ## 0.4.2 (2023-11-16)
21
+
22
+ - Updated Tokenizers to 0.15.0
23
+ - Fixed issue with download caching
24
+
25
+ ## 0.4.1 (2023-10-05)
26
+
27
+ - Fixed error loading gem
28
+
29
+ ## 0.4.0 (2023-09-20)
30
+
31
+ - Updated Tokenizers to 0.14.0
32
+ - Dropped support for Ruby < 3
33
+
34
+ ## 0.3.3 (2023-04-09)
35
+
36
+ - Updated Tokenizers to 0.13.3
37
+ - Added `ByteFallback`, `Fuse`, `Replace`, and `Strip` decoders
38
+ - Added `Prepend` normalizer
39
+
40
+ ## 0.3.2 (2023-03-06)
41
+
42
+ - Added precompiled gem for Linux x86-64 MUSL
43
+
44
+ ## 0.3.1 (2023-02-08)
45
+
46
+ - Fixed error with Ruby 2.7
47
+
48
+ ## 0.3.0 (2023-02-07)
49
+
50
+ - Added support for training tokenizers
51
+ - Added more methods to `Tokenizer`
52
+ - Added `encode_batch` method to `Encoding`
53
+ - Added `pair` argument to `encode` method
54
+ - Changed `encode` method to include special tokens by default
55
+ - Changed how offsets are calculated for strings with multibyte characters
56
+
57
+ ## 0.2.3 (2023-01-22)
58
+
59
+ - Added `add_special_tokens` option to `encode` method
60
+ - Added warning about `encode` method including special tokens by default in 0.3.0
61
+ - Added more methods to `Encoding`
62
+ - Fixed error with precompiled gem on Mac ARM
63
+
64
+ ## 0.2.2 (2023-01-15)
65
+
66
+ - Added precompiled gem for Linux ARM
67
+ - Added `from_file` method
68
+ - Fixed error with precompiled gem on Linux x86-64
69
+
70
+ ## 0.2.1 (2023-01-12)
71
+
72
+ - Added support for Ruby 3.2
73
+
74
+ ## 0.2.0 (2022-12-11)
75
+
76
+ - Added precompiled gems for Linux x86-64 and Mac
77
+ - Switched to `rb_sys` gem for building extension
78
+ - Updated Tokenizers to 0.13.2
79
+ - Updated Rust edition to 2021
80
+
81
+ ## 0.1.3 (2022-10-06)
82
+
83
+ - Updated Tokenizers to 0.13.1
84
+
85
+ ## 0.1.2 (2022-09-08)
86
+
87
+ - Fixed error with installation on Linux
88
+
89
+ ## 0.1.1 (2022-06-29)
90
+
91
+ - Fixed error with installation
92
+
93
+ ## 0.1.0 (2022-03-19)
94
+
95
+ - First release