tokenizers 0.5.3-x64-mingw-ucrt

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +107 -0
  3. data/Cargo.lock +898 -0
  4. data/Cargo.toml +6 -0
  5. data/LICENSE-THIRD-PARTY.txt +17427 -0
  6. data/LICENSE.txt +202 -0
  7. data/README.md +105 -0
  8. data/lib/tokenizers/3.1/tokenizers.so +0 -0
  9. data/lib/tokenizers/3.2/tokenizers.so +0 -0
  10. data/lib/tokenizers/3.3/tokenizers.so +0 -0
  11. data/lib/tokenizers/added_token.rb +7 -0
  12. data/lib/tokenizers/char_bpe_tokenizer.rb +22 -0
  13. data/lib/tokenizers/decoders/bpe_decoder.rb +9 -0
  14. data/lib/tokenizers/decoders/ctc.rb +9 -0
  15. data/lib/tokenizers/decoders/metaspace.rb +9 -0
  16. data/lib/tokenizers/decoders/strip.rb +9 -0
  17. data/lib/tokenizers/decoders/word_piece.rb +9 -0
  18. data/lib/tokenizers/encoding.rb +19 -0
  19. data/lib/tokenizers/from_pretrained.rb +125 -0
  20. data/lib/tokenizers/models/bpe.rb +9 -0
  21. data/lib/tokenizers/models/unigram.rb +9 -0
  22. data/lib/tokenizers/models/word_level.rb +13 -0
  23. data/lib/tokenizers/models/word_piece.rb +9 -0
  24. data/lib/tokenizers/normalizers/bert_normalizer.rb +9 -0
  25. data/lib/tokenizers/normalizers/prepend.rb +9 -0
  26. data/lib/tokenizers/normalizers/strip.rb +9 -0
  27. data/lib/tokenizers/pre_tokenizers/byte_level.rb +9 -0
  28. data/lib/tokenizers/pre_tokenizers/digits.rb +9 -0
  29. data/lib/tokenizers/pre_tokenizers/metaspace.rb +9 -0
  30. data/lib/tokenizers/pre_tokenizers/punctuation.rb +9 -0
  31. data/lib/tokenizers/pre_tokenizers/split.rb +9 -0
  32. data/lib/tokenizers/processors/byte_level.rb +9 -0
  33. data/lib/tokenizers/processors/roberta_processing.rb +9 -0
  34. data/lib/tokenizers/processors/template_processing.rb +9 -0
  35. data/lib/tokenizers/tokenizer.rb +45 -0
  36. data/lib/tokenizers/trainers/bpe_trainer.rb +9 -0
  37. data/lib/tokenizers/trainers/unigram_trainer.rb +26 -0
  38. data/lib/tokenizers/trainers/word_level_trainer.rb +9 -0
  39. data/lib/tokenizers/trainers/word_piece_trainer.rb +26 -0
  40. data/lib/tokenizers/version.rb +3 -0
  41. data/lib/tokenizers.rb +62 -0
  42. metadata +85 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 2575d80ae3967271482554c3a2e871dca95ce6545864a0222ac8571cd52aa5c9
4
+ data.tar.gz: 1970e935847b5ff6c2f4fa87ee0219796e1734eb8e777fca6f2a4c7c46108022
5
+ SHA512:
6
+ metadata.gz: 83c10038a44a00931c296cc6549cf5a1614446a094bfb9a87923fa57361e4c5eadfc52b81e27edfcd32d3b9e3ca8dde9d4a40d84df9c1844b16826a936784373
7
+ data.tar.gz: 6d7659d3928d03ffeb20632d3d0c44a34cda7882a96963e22d6e009b15c8c2935f3e45e879a515569b71e7dc524c5143b5220d76af46f322213399a94269c59f
data/CHANGELOG.md ADDED
@@ -0,0 +1,107 @@
1
+ ## 0.5.3 (2024-09-17)
2
+
3
+ - Added `AddedToken` class
4
+ - Added precompiled gem for Windows
5
+
6
+ ## 0.5.2 (2024-08-26)
7
+
8
+ - Added `from_str` method to `Tokenizer`
9
+ - Added `model` and `model=` methods to `Tokenizer`
10
+ - Added `decoder`, `pre_tokenizer`, `post_processor`, and `normalizer` methods to `Tokenizer`
11
+ - Added `decode` method to `Decoder`
12
+
13
+ ## 0.5.1 (2024-08-13)
14
+
15
+ - Updated Tokenizers to 0.20.0
16
+ - Added precompiled gem for Linux ARM MUSL
17
+
18
+ ## 0.5.0 (2024-05-21)
19
+
20
+ - Updated Tokenizers to 0.19.1
21
+ - Replaced `add_prefix_space` with `prepend_scheme` and `split` options for `Metaspace` decoder and pre-tokenizer
22
+ - Dropped support for Ruby < 3.1
23
+
24
+ ## 0.4.4 (2024-02-27)
25
+
26
+ - Updated Tokenizers to 0.15.2
27
+
28
+ ## 0.4.3 (2024-01-03)
29
+
30
+ - Added support for Ruby 3.3
31
+
32
+ ## 0.4.2 (2023-11-16)
33
+
34
+ - Updated Tokenizers to 0.15.0
35
+ - Fixed issue with download caching
36
+
37
+ ## 0.4.1 (2023-10-05)
38
+
39
+ - Fixed error loading gem
40
+
41
+ ## 0.4.0 (2023-09-20)
42
+
43
+ - Updated Tokenizers to 0.14.0
44
+ - Dropped support for Ruby < 3
45
+
46
+ ## 0.3.3 (2023-04-09)
47
+
48
+ - Updated Tokenizers to 0.13.3
49
+ - Added `ByteFallback`, `Fuse`, `Replace`, and `Strip` decoders
50
+ - Added `Prepend` normalizer
51
+
52
+ ## 0.3.2 (2023-03-06)
53
+
54
+ - Added precompiled gem for Linux x86-64 MUSL
55
+
56
+ ## 0.3.1 (2023-02-08)
57
+
58
+ - Fixed error with Ruby 2.7
59
+
60
+ ## 0.3.0 (2023-02-07)
61
+
62
+ - Added support for training tokenizers
63
+ - Added more methods to `Tokenizer`
64
+ - Added `encode_batch` method to `Encoding`
65
+ - Added `pair` argument to `encode` method
66
+ - Changed `encode` method to include special tokens by default
67
+ - Changed how offsets are calculated for strings with multibyte characters
68
+
69
+ ## 0.2.3 (2023-01-22)
70
+
71
+ - Added `add_special_tokens` option to `encode` method
72
+ - Added warning about `encode` method including special tokens by default in 0.3.0
73
+ - Added more methods to `Encoding`
74
+ - Fixed error with precompiled gem on Mac ARM
75
+
76
+ ## 0.2.2 (2023-01-15)
77
+
78
+ - Added precompiled gem for Linux ARM
79
+ - Added `from_file` method
80
+ - Fixed error with precompiled gem on Linux x86-64
81
+
82
+ ## 0.2.1 (2023-01-12)
83
+
84
+ - Added support for Ruby 3.2
85
+
86
+ ## 0.2.0 (2022-12-11)
87
+
88
+ - Added precompiled gems for Linux x86-64 and Mac
89
+ - Switched to `rb_sys` gem for building extension
90
+ - Updated Tokenizers to 0.13.2
91
+ - Updated Rust edition to 2021
92
+
93
+ ## 0.1.3 (2022-10-06)
94
+
95
+ - Updated Tokenizers to 0.13.1
96
+
97
+ ## 0.1.2 (2022-09-08)
98
+
99
+ - Fixed error with installation on Linux
100
+
101
+ ## 0.1.1 (2022-06-29)
102
+
103
+ - Fixed error with installation
104
+
105
+ ## 0.1.0 (2022-03-19)
106
+
107
+ - First release