tokenizers 0.4.2 → 0.4.4

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 954a1e29eea94d08df1d0a0c667d4554f0f979947e2c714114f5aa16db19ad11
4
- data.tar.gz: 910f44e5c05115dce6ee1fe1070138ba1a826a1142a5f43ff104990687cc7814
3
+ metadata.gz: fe2e8b2ec97ac4dabb4e401cb426079c065be420bf8da41c9484b3d38290589c
4
+ data.tar.gz: bba3b9d9d94a278e3f5189fda48b3b3a7f1ff369b66ae8b85945e92a2d8b90ed
5
5
  SHA512:
6
- metadata.gz: 8d88bcfaacae964414ad175f4f27f4e20bdf8131a4a97deabdc732f75c12ca2ad956ff3c3f0967d607b46748a8abe7fc1c94538ba9e9ddaa3abaff7132ec15ca
7
- data.tar.gz: 95c8e8225da0070aa947ac8c6436e90f490e5dff033f532f3c2d61af63c7c5beff36398b6a78b4c6e69eca7c4d64834ea43d8c988e3ccdae8ee30e0c84f126bd
6
+ metadata.gz: fd5876cc95c22b917f1a5715f9b9651f714291f017c00f83316ad2c3cc48c8819d4de70288b4067570c9f729cca9d06cbfa8562996a61ac694e202268a58cdea
7
+ data.tar.gz: cb793a73aabafe933ef8d7259b7a36848b1f38cacdf20ef3f6f8554671b7de6ece53b3c14f46f368f540ecbab5dfca087f730e99c22caf84c7b35d539b07b633
data/CHANGELOG.md CHANGED
@@ -1,3 +1,11 @@
1
+ ## 0.4.4 (2024-02-27)
2
+
3
+ - Updated Tokenizers to 0.15.2
4
+
5
+ ## 0.4.3 (2024-01-03)
6
+
7
+ - Added support for Ruby 3.3
8
+
1
9
  ## 0.4.2 (2023-11-16)
2
10
 
3
11
  - Updated Tokenizers to 0.15.0
data/Cargo.lock CHANGED
@@ -25,11 +25,11 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
25
25
 
26
26
  [[package]]
27
27
  name = "bindgen"
28
- version = "0.62.0"
28
+ version = "0.69.1"
29
29
  source = "registry+https://github.com/rust-lang/crates.io-index"
30
- checksum = "c6720a8b7b2d39dd533285ed438d458f65b31b5c257e6ac7bb3d7e82844dd722"
30
+ checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2"
31
31
  dependencies = [
32
- "bitflags",
32
+ "bitflags 2.4.1",
33
33
  "cexpr",
34
34
  "clang-sys",
35
35
  "lazy_static",
@@ -40,7 +40,7 @@ dependencies = [
40
40
  "regex",
41
41
  "rustc-hash",
42
42
  "shlex",
43
- "syn 1.0.109",
43
+ "syn 2.0.38",
44
44
  ]
45
45
 
46
46
  [[package]]
@@ -49,6 +49,12 @@ version = "1.3.2"
49
49
  source = "registry+https://github.com/rust-lang/crates.io-index"
50
50
  checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
51
51
 
52
+ [[package]]
53
+ name = "bitflags"
54
+ version = "2.4.1"
55
+ source = "registry+https://github.com/rust-lang/crates.io-index"
56
+ checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
57
+
52
58
  [[package]]
53
59
  name = "cc"
54
60
  version = "1.0.79"
@@ -274,6 +280,15 @@ dependencies = [
274
280
  "either",
275
281
  ]
276
282
 
283
+ [[package]]
284
+ name = "itertools"
285
+ version = "0.12.1"
286
+ source = "registry+https://github.com/rust-lang/crates.io-index"
287
+ checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
288
+ dependencies = [
289
+ "either",
290
+ ]
291
+
277
292
  [[package]]
278
293
  name = "itoa"
279
294
  version = "1.0.6"
@@ -335,9 +350,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
335
350
 
336
351
  [[package]]
337
352
  name = "magnus"
338
- version = "0.6.0"
353
+ version = "0.6.2"
339
354
  source = "registry+https://github.com/rust-lang/crates.io-index"
340
- checksum = "68e9585bfe236e88e6b10b6d8eb5349bd0e0009f3f9dff8d2e99a82601b33743"
355
+ checksum = "4778544796676e8428e9c622460ebf284bea52d8b10db3aeb449d8b5e61b3a13"
341
356
  dependencies = [
342
357
  "magnus-macros",
343
358
  "rb-sys",
@@ -426,7 +441,7 @@ version = "6.4.0"
426
441
  source = "registry+https://github.com/rust-lang/crates.io-index"
427
442
  checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f"
428
443
  dependencies = [
429
- "bitflags",
444
+ "bitflags 1.3.2",
430
445
  "libc",
431
446
  "once_cell",
432
447
  "onig_sys",
@@ -537,7 +552,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
537
552
  checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9"
538
553
  dependencies = [
539
554
  "either",
540
- "itertools",
555
+ "itertools 0.11.0",
541
556
  "rayon",
542
557
  ]
543
558
 
@@ -553,18 +568,18 @@ dependencies = [
553
568
 
554
569
  [[package]]
555
570
  name = "rb-sys"
556
- version = "0.9.79"
571
+ version = "0.9.89"
557
572
  source = "registry+https://github.com/rust-lang/crates.io-index"
558
- checksum = "939fb78db3e4f26665c1d4c7b91ca66d3578335a19aba552d4a6445811d07072"
573
+ checksum = "0d197f2c03751ef006f29d593d22aa9068c9c358e04ca503afea0329c366147c"
559
574
  dependencies = [
560
575
  "rb-sys-build",
561
576
  ]
562
577
 
563
578
  [[package]]
564
579
  name = "rb-sys-build"
565
- version = "0.9.79"
580
+ version = "0.9.89"
566
581
  source = "registry+https://github.com/rust-lang/crates.io-index"
567
- checksum = "335a95eb0420d52fa94ef12019df3c2c250c6b19cbb3c60bd05cb7e9c362072c"
582
+ checksum = "2b50caf8fd028f12abe00d6debe2ae2adf6202c9ca3caa59487eda710d90fa28"
568
583
  dependencies = [
569
584
  "bindgen",
570
585
  "lazy_static",
@@ -572,7 +587,7 @@ dependencies = [
572
587
  "quote",
573
588
  "regex",
574
589
  "shell-words",
575
- "syn 1.0.109",
590
+ "syn 2.0.38",
576
591
  ]
577
592
 
578
593
  [[package]]
@@ -590,7 +605,7 @@ dependencies = [
590
605
  "aho-corasick",
591
606
  "memchr",
592
607
  "regex-automata",
593
- "regex-syntax",
608
+ "regex-syntax 0.7.5",
594
609
  ]
595
610
 
596
611
  [[package]]
@@ -601,7 +616,7 @@ checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
601
616
  dependencies = [
602
617
  "aho-corasick",
603
618
  "memchr",
604
- "regex-syntax",
619
+ "regex-syntax 0.7.5",
605
620
  ]
606
621
 
607
622
  [[package]]
@@ -610,6 +625,12 @@ version = "0.7.5"
610
625
  source = "registry+https://github.com/rust-lang/crates.io-index"
611
626
  checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
612
627
 
628
+ [[package]]
629
+ name = "regex-syntax"
630
+ version = "0.8.2"
631
+ source = "registry+https://github.com/rust-lang/crates.io-index"
632
+ checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
633
+
613
634
  [[package]]
614
635
  name = "rustc-hash"
615
636
  version = "1.1.0"
@@ -745,26 +766,26 @@ dependencies = [
745
766
 
746
767
  [[package]]
747
768
  name = "tokenizers"
748
- version = "0.4.2"
769
+ version = "0.4.4"
749
770
  dependencies = [
750
771
  "magnus",
751
772
  "onig",
752
773
  "serde",
753
- "tokenizers 0.15.0",
774
+ "tokenizers 0.15.2",
754
775
  ]
755
776
 
756
777
  [[package]]
757
778
  name = "tokenizers"
758
- version = "0.15.0"
779
+ version = "0.15.2"
759
780
  source = "registry+https://github.com/rust-lang/crates.io-index"
760
- checksum = "062b8a9613d6017633b80fb55fbb33f1aff006c36225a3025630753398034b3c"
781
+ checksum = "3dd47962b0ba36e7fd33518fbf1754d136fd1474000162bbf2a8b5fcb2d3654d"
761
782
  dependencies = [
762
783
  "aho-corasick",
763
784
  "derive_builder",
764
785
  "esaxx-rs",
765
786
  "getrandom",
766
787
  "indicatif",
767
- "itertools",
788
+ "itertools 0.12.1",
768
789
  "lazy_static",
769
790
  "log",
770
791
  "macro_rules_attribute",
@@ -775,7 +796,7 @@ dependencies = [
775
796
  "rayon",
776
797
  "rayon-cond",
777
798
  "regex",
778
- "regex-syntax",
799
+ "regex-syntax 0.8.2",
779
800
  "serde",
780
801
  "serde_json",
781
802
  "spm_precompiled",
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  :slightly_smiling_face: Fast state-of-the-art [tokenizers](https://github.com/huggingface/tokenizers) for Ruby
4
4
 
5
- [![Build Status](https://github.com/ankane/tokenizers-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/tokenizers-ruby/actions)
5
+ [![Build Status](https://github.com/ankane/tokenizers-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/tokenizers-ruby/actions)
6
6
 
7
7
  ## Installation
8
8
 
@@ -34,15 +34,51 @@ Decode
34
34
  tokenizer.decode(ids)
35
35
  ```
36
36
 
37
- Load a tokenizer from files
37
+ ## Training
38
+
39
+ Create a tokenizer
38
40
 
39
41
  ```ruby
40
- tokenizer = Tokenizers::CharBPETokenizer.new("vocab.json", "merges.txt")
42
+ tokenizer = Tokenizers::Tokenizer.new(Tokenizers::Models::BPE.new(unk_token: "[UNK]"))
41
43
  ```
42
44
 
43
- ## Training
45
+ Set the pre-tokenizer
46
+
47
+ ```ruby
48
+ tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::Whitespace.new
49
+ ```
50
+
51
+ Train the tokenizer ([example data](https://huggingface.co/docs/tokenizers/quicktour#build-a-tokenizer-from-scratch))
52
+
53
+ ```ruby
54
+ trainer = Tokenizers::Trainers::BpeTrainer.new(special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
55
+ tokenizer.train(["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"], trainer)
56
+ ```
57
+
58
+ Encode
59
+
60
+ ```ruby
61
+ output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
62
+ output.tokens
63
+ ```
64
+
65
+ Save the tokenizer to a file
66
+
67
+ ```ruby
68
+ tokenizer.save("tokenizer.json")
69
+ ```
70
+
71
+ Load a tokenizer from a file
72
+
73
+ ```ruby
74
+ tokenizer = Tokenizers.from_file("tokenizer.json")
75
+ ```
76
+
77
+ Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8) for more info
78
+
79
+ ## API
44
80
 
45
- Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8)
81
+ This library follows the [Tokenizers Python API](https://huggingface.co/docs/tokenizers/index). You can follow Python tutorials and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
46
82
 
47
83
  ## History
48
84
 
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "tokenizers"
3
- version = "0.4.2"
3
+ version = "0.4.4"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -16,6 +16,6 @@ onig = { version = "6", default-features = false }
16
16
  serde = { version = "1", features = ["rc", "derive"] }
17
17
 
18
18
  [dependencies.tokenizers]
19
- version = "=0.15.0" # also update in from_pretrained.rb
19
+ version = "=0.15.2" # also update in from_pretrained.rb
20
20
  default-features = false
21
21
  features = ["progressbar", "onig", "esaxx_fast"]
@@ -77,11 +77,11 @@ impl RbTrainer {
77
77
  setter!(self, BpeTrainer, vocab_size, vocab_size);
78
78
  }
79
79
 
80
- fn bpe_trainer_min_frequency(&self) -> u32 {
80
+ fn bpe_trainer_min_frequency(&self) -> u64 {
81
81
  getter!(self, BpeTrainer, min_frequency)
82
82
  }
83
83
 
84
- fn bpe_trainer_set_min_frequency(&self, freq: u32) {
84
+ fn bpe_trainer_set_min_frequency(&self, freq: u64) {
85
85
  setter!(self, BpeTrainer, min_frequency, freq);
86
86
  }
87
87
 
@@ -235,11 +235,11 @@ impl RbTrainer {
235
235
  setter!(self, WordLevelTrainer, vocab_size, vocab_size);
236
236
  }
237
237
 
238
- fn word_level_trainer_min_frequency(&self) -> u32 {
238
+ fn word_level_trainer_min_frequency(&self) -> u64 {
239
239
  getter!(self, WordLevelTrainer, min_frequency)
240
240
  }
241
241
 
242
- fn word_level_trainer_set_min_frequency(&self, freq: u32) {
242
+ fn word_level_trainer_set_min_frequency(&self, freq: u64) {
243
243
  setter!(self, WordLevelTrainer, min_frequency, freq);
244
244
  }
245
245
 
@@ -289,11 +289,11 @@ impl RbTrainer {
289
289
  setter!(self, WordPieceTrainer, @set_vocab_size, vocab_size);
290
290
  }
291
291
 
292
- fn word_piece_trainer_min_frequency(&self) -> u32 {
292
+ fn word_piece_trainer_min_frequency(&self) -> u64 {
293
293
  getter!(self, WordPieceTrainer, min_frequency())
294
294
  }
295
295
 
296
- fn word_piece_trainer_set_min_frequency(&self, freq: u32) {
296
+ fn word_piece_trainer_set_min_frequency(&self, freq: u64) {
297
297
  setter!(self, WordPieceTrainer, @set_min_frequency, freq);
298
298
  }
299
299
 
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.15.0"
4
+ TOKENIZERS_VERSION = "0.15.2"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.4.2"
2
+ VERSION = "0.4.4"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.4.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-11-16 00:00:00.000000000 Z
11
+ date: 2024-02-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
100
100
  - !ruby/object:Gem::Version
101
101
  version: '0'
102
102
  requirements: []
103
- rubygems_version: 3.4.10
103
+ rubygems_version: 3.5.3
104
104
  signing_key:
105
105
  specification_version: 4
106
106
  summary: Fast state-of-the-art tokenizers for Ruby