tokenizers 0.4.2 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 954a1e29eea94d08df1d0a0c667d4554f0f979947e2c714114f5aa16db19ad11
4
- data.tar.gz: 910f44e5c05115dce6ee1fe1070138ba1a826a1142a5f43ff104990687cc7814
3
+ metadata.gz: fe2e8b2ec97ac4dabb4e401cb426079c065be420bf8da41c9484b3d38290589c
4
+ data.tar.gz: bba3b9d9d94a278e3f5189fda48b3b3a7f1ff369b66ae8b85945e92a2d8b90ed
5
5
  SHA512:
6
- metadata.gz: 8d88bcfaacae964414ad175f4f27f4e20bdf8131a4a97deabdc732f75c12ca2ad956ff3c3f0967d607b46748a8abe7fc1c94538ba9e9ddaa3abaff7132ec15ca
7
- data.tar.gz: 95c8e8225da0070aa947ac8c6436e90f490e5dff033f532f3c2d61af63c7c5beff36398b6a78b4c6e69eca7c4d64834ea43d8c988e3ccdae8ee30e0c84f126bd
6
+ metadata.gz: fd5876cc95c22b917f1a5715f9b9651f714291f017c00f83316ad2c3cc48c8819d4de70288b4067570c9f729cca9d06cbfa8562996a61ac694e202268a58cdea
7
+ data.tar.gz: cb793a73aabafe933ef8d7259b7a36848b1f38cacdf20ef3f6f8554671b7de6ece53b3c14f46f368f540ecbab5dfca087f730e99c22caf84c7b35d539b07b633
data/CHANGELOG.md CHANGED
@@ -1,3 +1,11 @@
1
+ ## 0.4.4 (2024-02-27)
2
+
3
+ - Updated Tokenizers to 0.15.2
4
+
5
+ ## 0.4.3 (2024-01-03)
6
+
7
+ - Added support for Ruby 3.3
8
+
1
9
  ## 0.4.2 (2023-11-16)
2
10
 
3
11
  - Updated Tokenizers to 0.15.0
data/Cargo.lock CHANGED
@@ -25,11 +25,11 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
25
25
 
26
26
  [[package]]
27
27
  name = "bindgen"
28
- version = "0.62.0"
28
+ version = "0.69.1"
29
29
  source = "registry+https://github.com/rust-lang/crates.io-index"
30
- checksum = "c6720a8b7b2d39dd533285ed438d458f65b31b5c257e6ac7bb3d7e82844dd722"
30
+ checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2"
31
31
  dependencies = [
32
- "bitflags",
32
+ "bitflags 2.4.1",
33
33
  "cexpr",
34
34
  "clang-sys",
35
35
  "lazy_static",
@@ -40,7 +40,7 @@ dependencies = [
40
40
  "regex",
41
41
  "rustc-hash",
42
42
  "shlex",
43
- "syn 1.0.109",
43
+ "syn 2.0.38",
44
44
  ]
45
45
 
46
46
  [[package]]
@@ -49,6 +49,12 @@ version = "1.3.2"
49
49
  source = "registry+https://github.com/rust-lang/crates.io-index"
50
50
  checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
51
51
 
52
+ [[package]]
53
+ name = "bitflags"
54
+ version = "2.4.1"
55
+ source = "registry+https://github.com/rust-lang/crates.io-index"
56
+ checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
57
+
52
58
  [[package]]
53
59
  name = "cc"
54
60
  version = "1.0.79"
@@ -274,6 +280,15 @@ dependencies = [
274
280
  "either",
275
281
  ]
276
282
 
283
+ [[package]]
284
+ name = "itertools"
285
+ version = "0.12.1"
286
+ source = "registry+https://github.com/rust-lang/crates.io-index"
287
+ checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
288
+ dependencies = [
289
+ "either",
290
+ ]
291
+
277
292
  [[package]]
278
293
  name = "itoa"
279
294
  version = "1.0.6"
@@ -335,9 +350,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
335
350
 
336
351
  [[package]]
337
352
  name = "magnus"
338
- version = "0.6.0"
353
+ version = "0.6.2"
339
354
  source = "registry+https://github.com/rust-lang/crates.io-index"
340
- checksum = "68e9585bfe236e88e6b10b6d8eb5349bd0e0009f3f9dff8d2e99a82601b33743"
355
+ checksum = "4778544796676e8428e9c622460ebf284bea52d8b10db3aeb449d8b5e61b3a13"
341
356
  dependencies = [
342
357
  "magnus-macros",
343
358
  "rb-sys",
@@ -426,7 +441,7 @@ version = "6.4.0"
426
441
  source = "registry+https://github.com/rust-lang/crates.io-index"
427
442
  checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f"
428
443
  dependencies = [
429
- "bitflags",
444
+ "bitflags 1.3.2",
430
445
  "libc",
431
446
  "once_cell",
432
447
  "onig_sys",
@@ -537,7 +552,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
537
552
  checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9"
538
553
  dependencies = [
539
554
  "either",
540
- "itertools",
555
+ "itertools 0.11.0",
541
556
  "rayon",
542
557
  ]
543
558
 
@@ -553,18 +568,18 @@ dependencies = [
553
568
 
554
569
  [[package]]
555
570
  name = "rb-sys"
556
- version = "0.9.79"
571
+ version = "0.9.89"
557
572
  source = "registry+https://github.com/rust-lang/crates.io-index"
558
- checksum = "939fb78db3e4f26665c1d4c7b91ca66d3578335a19aba552d4a6445811d07072"
573
+ checksum = "0d197f2c03751ef006f29d593d22aa9068c9c358e04ca503afea0329c366147c"
559
574
  dependencies = [
560
575
  "rb-sys-build",
561
576
  ]
562
577
 
563
578
  [[package]]
564
579
  name = "rb-sys-build"
565
- version = "0.9.79"
580
+ version = "0.9.89"
566
581
  source = "registry+https://github.com/rust-lang/crates.io-index"
567
- checksum = "335a95eb0420d52fa94ef12019df3c2c250c6b19cbb3c60bd05cb7e9c362072c"
582
+ checksum = "2b50caf8fd028f12abe00d6debe2ae2adf6202c9ca3caa59487eda710d90fa28"
568
583
  dependencies = [
569
584
  "bindgen",
570
585
  "lazy_static",
@@ -572,7 +587,7 @@ dependencies = [
572
587
  "quote",
573
588
  "regex",
574
589
  "shell-words",
575
- "syn 1.0.109",
590
+ "syn 2.0.38",
576
591
  ]
577
592
 
578
593
  [[package]]
@@ -590,7 +605,7 @@ dependencies = [
590
605
  "aho-corasick",
591
606
  "memchr",
592
607
  "regex-automata",
593
- "regex-syntax",
608
+ "regex-syntax 0.7.5",
594
609
  ]
595
610
 
596
611
  [[package]]
@@ -601,7 +616,7 @@ checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
601
616
  dependencies = [
602
617
  "aho-corasick",
603
618
  "memchr",
604
- "regex-syntax",
619
+ "regex-syntax 0.7.5",
605
620
  ]
606
621
 
607
622
  [[package]]
@@ -610,6 +625,12 @@ version = "0.7.5"
610
625
  source = "registry+https://github.com/rust-lang/crates.io-index"
611
626
  checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
612
627
 
628
+ [[package]]
629
+ name = "regex-syntax"
630
+ version = "0.8.2"
631
+ source = "registry+https://github.com/rust-lang/crates.io-index"
632
+ checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
633
+
613
634
  [[package]]
614
635
  name = "rustc-hash"
615
636
  version = "1.1.0"
@@ -745,26 +766,26 @@ dependencies = [
745
766
 
746
767
  [[package]]
747
768
  name = "tokenizers"
748
- version = "0.4.2"
769
+ version = "0.4.4"
749
770
  dependencies = [
750
771
  "magnus",
751
772
  "onig",
752
773
  "serde",
753
- "tokenizers 0.15.0",
774
+ "tokenizers 0.15.2",
754
775
  ]
755
776
 
756
777
  [[package]]
757
778
  name = "tokenizers"
758
- version = "0.15.0"
779
+ version = "0.15.2"
759
780
  source = "registry+https://github.com/rust-lang/crates.io-index"
760
- checksum = "062b8a9613d6017633b80fb55fbb33f1aff006c36225a3025630753398034b3c"
781
+ checksum = "3dd47962b0ba36e7fd33518fbf1754d136fd1474000162bbf2a8b5fcb2d3654d"
761
782
  dependencies = [
762
783
  "aho-corasick",
763
784
  "derive_builder",
764
785
  "esaxx-rs",
765
786
  "getrandom",
766
787
  "indicatif",
767
- "itertools",
788
+ "itertools 0.12.1",
768
789
  "lazy_static",
769
790
  "log",
770
791
  "macro_rules_attribute",
@@ -775,7 +796,7 @@ dependencies = [
775
796
  "rayon",
776
797
  "rayon-cond",
777
798
  "regex",
778
- "regex-syntax",
799
+ "regex-syntax 0.8.2",
779
800
  "serde",
780
801
  "serde_json",
781
802
  "spm_precompiled",
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  :slightly_smiling_face: Fast state-of-the-art [tokenizers](https://github.com/huggingface/tokenizers) for Ruby
4
4
 
5
- [![Build Status](https://github.com/ankane/tokenizers-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/tokenizers-ruby/actions)
5
+ [![Build Status](https://github.com/ankane/tokenizers-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/tokenizers-ruby/actions)
6
6
 
7
7
  ## Installation
8
8
 
@@ -34,15 +34,51 @@ Decode
34
34
  tokenizer.decode(ids)
35
35
  ```
36
36
 
37
- Load a tokenizer from files
37
+ ## Training
38
+
39
+ Create a tokenizer
38
40
 
39
41
  ```ruby
40
- tokenizer = Tokenizers::CharBPETokenizer.new("vocab.json", "merges.txt")
42
+ tokenizer = Tokenizers::Tokenizer.new(Tokenizers::Models::BPE.new(unk_token: "[UNK]"))
41
43
  ```
42
44
 
43
- ## Training
45
+ Set the pre-tokenizer
46
+
47
+ ```ruby
48
+ tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::Whitespace.new
49
+ ```
50
+
51
+ Train the tokenizer ([example data](https://huggingface.co/docs/tokenizers/quicktour#build-a-tokenizer-from-scratch))
52
+
53
+ ```ruby
54
+ trainer = Tokenizers::Trainers::BpeTrainer.new(special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
55
+ tokenizer.train(["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"], trainer)
56
+ ```
57
+
58
+ Encode
59
+
60
+ ```ruby
61
+ output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
62
+ output.tokens
63
+ ```
64
+
65
+ Save the tokenizer to a file
66
+
67
+ ```ruby
68
+ tokenizer.save("tokenizer.json")
69
+ ```
70
+
71
+ Load a tokenizer from a file
72
+
73
+ ```ruby
74
+ tokenizer = Tokenizers.from_file("tokenizer.json")
75
+ ```
76
+
77
+ Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8) for more info
78
+
79
+ ## API
44
80
 
45
- Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8)
81
+ This library follows the [Tokenizers Python API](https://huggingface.co/docs/tokenizers/index). You can follow Python tutorials and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
46
82
 
47
83
  ## History
48
84
 
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "tokenizers"
3
- version = "0.4.2"
3
+ version = "0.4.4"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -16,6 +16,6 @@ onig = { version = "6", default-features = false }
16
16
  serde = { version = "1", features = ["rc", "derive"] }
17
17
 
18
18
  [dependencies.tokenizers]
19
- version = "=0.15.0" # also update in from_pretrained.rb
19
+ version = "=0.15.2" # also update in from_pretrained.rb
20
20
  default-features = false
21
21
  features = ["progressbar", "onig", "esaxx_fast"]
@@ -77,11 +77,11 @@ impl RbTrainer {
77
77
  setter!(self, BpeTrainer, vocab_size, vocab_size);
78
78
  }
79
79
 
80
- fn bpe_trainer_min_frequency(&self) -> u32 {
80
+ fn bpe_trainer_min_frequency(&self) -> u64 {
81
81
  getter!(self, BpeTrainer, min_frequency)
82
82
  }
83
83
 
84
- fn bpe_trainer_set_min_frequency(&self, freq: u32) {
84
+ fn bpe_trainer_set_min_frequency(&self, freq: u64) {
85
85
  setter!(self, BpeTrainer, min_frequency, freq);
86
86
  }
87
87
 
@@ -235,11 +235,11 @@ impl RbTrainer {
235
235
  setter!(self, WordLevelTrainer, vocab_size, vocab_size);
236
236
  }
237
237
 
238
- fn word_level_trainer_min_frequency(&self) -> u32 {
238
+ fn word_level_trainer_min_frequency(&self) -> u64 {
239
239
  getter!(self, WordLevelTrainer, min_frequency)
240
240
  }
241
241
 
242
- fn word_level_trainer_set_min_frequency(&self, freq: u32) {
242
+ fn word_level_trainer_set_min_frequency(&self, freq: u64) {
243
243
  setter!(self, WordLevelTrainer, min_frequency, freq);
244
244
  }
245
245
 
@@ -289,11 +289,11 @@ impl RbTrainer {
289
289
  setter!(self, WordPieceTrainer, @set_vocab_size, vocab_size);
290
290
  }
291
291
 
292
- fn word_piece_trainer_min_frequency(&self) -> u32 {
292
+ fn word_piece_trainer_min_frequency(&self) -> u64 {
293
293
  getter!(self, WordPieceTrainer, min_frequency())
294
294
  }
295
295
 
296
- fn word_piece_trainer_set_min_frequency(&self, freq: u32) {
296
+ fn word_piece_trainer_set_min_frequency(&self, freq: u64) {
297
297
  setter!(self, WordPieceTrainer, @set_min_frequency, freq);
298
298
  }
299
299
 
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.15.0"
4
+ TOKENIZERS_VERSION = "0.15.2"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.4.2"
2
+ VERSION = "0.4.4"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.4.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-11-16 00:00:00.000000000 Z
11
+ date: 2024-02-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
100
100
  - !ruby/object:Gem::Version
101
101
  version: '0'
102
102
  requirements: []
103
- rubygems_version: 3.4.10
103
+ rubygems_version: 3.5.3
104
104
  signing_key:
105
105
  specification_version: 4
106
106
  summary: Fast state-of-the-art tokenizers for Ruby