tokenizers 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 31ba3313f98f5360a6e9b0434c674d113622cf0421c6e7610ba250a8a9c79402
4
- data.tar.gz: ba7c1913bdafa2b58835ac3689a34c37206b1a567c634f81aedaea0ca21a20cf
3
+ metadata.gz: fe2e8b2ec97ac4dabb4e401cb426079c065be420bf8da41c9484b3d38290589c
4
+ data.tar.gz: bba3b9d9d94a278e3f5189fda48b3b3a7f1ff369b66ae8b85945e92a2d8b90ed
5
5
  SHA512:
6
- metadata.gz: 7cac28260eea675e5cea80324fa755681e9a8a06ce38fc501c57da69056d36d0485576dac1237da21a7d23ca24dacc46b6b1a92d4c7b5f91644fda37b5550ada
7
- data.tar.gz: 84c361eadb625a96234b454f91b7f9a847010e42927042978003e3219e1b28d1da8d7665bc9cda7aa820b6053e5c6be1458cdaee6fe1d4709df632fb744155b4
6
+ metadata.gz: fd5876cc95c22b917f1a5715f9b9651f714291f017c00f83316ad2c3cc48c8819d4de70288b4067570c9f729cca9d06cbfa8562996a61ac694e202268a58cdea
7
+ data.tar.gz: cb793a73aabafe933ef8d7259b7a36848b1f38cacdf20ef3f6f8554671b7de6ece53b3c14f46f368f540ecbab5dfca087f730e99c22caf84c7b35d539b07b633
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ ## 0.4.4 (2024-02-27)
2
+
3
+ - Updated Tokenizers to 0.15.2
4
+
1
5
  ## 0.4.3 (2024-01-03)
2
6
 
3
7
  - Added support for Ruby 3.3
data/Cargo.lock CHANGED
@@ -280,6 +280,15 @@ dependencies = [
280
280
  "either",
281
281
  ]
282
282
 
283
+ [[package]]
284
+ name = "itertools"
285
+ version = "0.12.1"
286
+ source = "registry+https://github.com/rust-lang/crates.io-index"
287
+ checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
288
+ dependencies = [
289
+ "either",
290
+ ]
291
+
283
292
  [[package]]
284
293
  name = "itoa"
285
294
  version = "1.0.6"
@@ -543,7 +552,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
543
552
  checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9"
544
553
  dependencies = [
545
554
  "either",
546
- "itertools",
555
+ "itertools 0.11.0",
547
556
  "rayon",
548
557
  ]
549
558
 
@@ -559,18 +568,18 @@ dependencies = [
559
568
 
560
569
  [[package]]
561
570
  name = "rb-sys"
562
- version = "0.9.86"
571
+ version = "0.9.89"
563
572
  source = "registry+https://github.com/rust-lang/crates.io-index"
564
- checksum = "7285f2a7b92f58ab198e3fd59a71d2861478f9c4642f41e83582385818941697"
573
+ checksum = "0d197f2c03751ef006f29d593d22aa9068c9c358e04ca503afea0329c366147c"
565
574
  dependencies = [
566
575
  "rb-sys-build",
567
576
  ]
568
577
 
569
578
  [[package]]
570
579
  name = "rb-sys-build"
571
- version = "0.9.86"
580
+ version = "0.9.89"
572
581
  source = "registry+https://github.com/rust-lang/crates.io-index"
573
- checksum = "71583945f94dabb6c0dfa63f1b71e929c1901e1e288ef3739ab8bed3b7069550"
582
+ checksum = "2b50caf8fd028f12abe00d6debe2ae2adf6202c9ca3caa59487eda710d90fa28"
574
583
  dependencies = [
575
584
  "bindgen",
576
585
  "lazy_static",
@@ -596,7 +605,7 @@ dependencies = [
596
605
  "aho-corasick",
597
606
  "memchr",
598
607
  "regex-automata",
599
- "regex-syntax",
608
+ "regex-syntax 0.7.5",
600
609
  ]
601
610
 
602
611
  [[package]]
@@ -607,7 +616,7 @@ checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
607
616
  dependencies = [
608
617
  "aho-corasick",
609
618
  "memchr",
610
- "regex-syntax",
619
+ "regex-syntax 0.7.5",
611
620
  ]
612
621
 
613
622
  [[package]]
@@ -616,6 +625,12 @@ version = "0.7.5"
616
625
  source = "registry+https://github.com/rust-lang/crates.io-index"
617
626
  checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
618
627
 
628
+ [[package]]
629
+ name = "regex-syntax"
630
+ version = "0.8.2"
631
+ source = "registry+https://github.com/rust-lang/crates.io-index"
632
+ checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
633
+
619
634
  [[package]]
620
635
  name = "rustc-hash"
621
636
  version = "1.1.0"
@@ -751,26 +766,26 @@ dependencies = [
751
766
 
752
767
  [[package]]
753
768
  name = "tokenizers"
754
- version = "0.4.3"
769
+ version = "0.4.4"
755
770
  dependencies = [
756
771
  "magnus",
757
772
  "onig",
758
773
  "serde",
759
- "tokenizers 0.15.0",
774
+ "tokenizers 0.15.2",
760
775
  ]
761
776
 
762
777
  [[package]]
763
778
  name = "tokenizers"
764
- version = "0.15.0"
779
+ version = "0.15.2"
765
780
  source = "registry+https://github.com/rust-lang/crates.io-index"
766
- checksum = "062b8a9613d6017633b80fb55fbb33f1aff006c36225a3025630753398034b3c"
781
+ checksum = "3dd47962b0ba36e7fd33518fbf1754d136fd1474000162bbf2a8b5fcb2d3654d"
767
782
  dependencies = [
768
783
  "aho-corasick",
769
784
  "derive_builder",
770
785
  "esaxx-rs",
771
786
  "getrandom",
772
787
  "indicatif",
773
- "itertools",
788
+ "itertools 0.12.1",
774
789
  "lazy_static",
775
790
  "log",
776
791
  "macro_rules_attribute",
@@ -781,7 +796,7 @@ dependencies = [
781
796
  "rayon",
782
797
  "rayon-cond",
783
798
  "regex",
784
- "regex-syntax",
799
+ "regex-syntax 0.8.2",
785
800
  "serde",
786
801
  "serde_json",
787
802
  "spm_precompiled",
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  :slightly_smiling_face: Fast state-of-the-art [tokenizers](https://github.com/huggingface/tokenizers) for Ruby
4
4
 
5
- [![Build Status](https://github.com/ankane/tokenizers-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/tokenizers-ruby/actions)
5
+ [![Build Status](https://github.com/ankane/tokenizers-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/tokenizers-ruby/actions)
6
6
 
7
7
  ## Installation
8
8
 
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "tokenizers"
3
- version = "0.4.3"
3
+ version = "0.4.4"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -16,6 +16,6 @@ onig = { version = "6", default-features = false }
16
16
  serde = { version = "1", features = ["rc", "derive"] }
17
17
 
18
18
  [dependencies.tokenizers]
19
- version = "=0.15.0" # also update in from_pretrained.rb
19
+ version = "=0.15.2" # also update in from_pretrained.rb
20
20
  default-features = false
21
21
  features = ["progressbar", "onig", "esaxx_fast"]
@@ -77,11 +77,11 @@ impl RbTrainer {
77
77
  setter!(self, BpeTrainer, vocab_size, vocab_size);
78
78
  }
79
79
 
80
- fn bpe_trainer_min_frequency(&self) -> u32 {
80
+ fn bpe_trainer_min_frequency(&self) -> u64 {
81
81
  getter!(self, BpeTrainer, min_frequency)
82
82
  }
83
83
 
84
- fn bpe_trainer_set_min_frequency(&self, freq: u32) {
84
+ fn bpe_trainer_set_min_frequency(&self, freq: u64) {
85
85
  setter!(self, BpeTrainer, min_frequency, freq);
86
86
  }
87
87
 
@@ -235,11 +235,11 @@ impl RbTrainer {
235
235
  setter!(self, WordLevelTrainer, vocab_size, vocab_size);
236
236
  }
237
237
 
238
- fn word_level_trainer_min_frequency(&self) -> u32 {
238
+ fn word_level_trainer_min_frequency(&self) -> u64 {
239
239
  getter!(self, WordLevelTrainer, min_frequency)
240
240
  }
241
241
 
242
- fn word_level_trainer_set_min_frequency(&self, freq: u32) {
242
+ fn word_level_trainer_set_min_frequency(&self, freq: u64) {
243
243
  setter!(self, WordLevelTrainer, min_frequency, freq);
244
244
  }
245
245
 
@@ -289,11 +289,11 @@ impl RbTrainer {
289
289
  setter!(self, WordPieceTrainer, @set_vocab_size, vocab_size);
290
290
  }
291
291
 
292
- fn word_piece_trainer_min_frequency(&self) -> u32 {
292
+ fn word_piece_trainer_min_frequency(&self) -> u64 {
293
293
  getter!(self, WordPieceTrainer, min_frequency())
294
294
  }
295
295
 
296
- fn word_piece_trainer_set_min_frequency(&self, freq: u32) {
296
+ fn word_piece_trainer_set_min_frequency(&self, freq: u64) {
297
297
  setter!(self, WordPieceTrainer, @set_min_frequency, freq);
298
298
  }
299
299
 
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.15.0"
4
+ TOKENIZERS_VERSION = "0.15.2"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.4.3"
2
+ VERSION = "0.4.4"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.3
4
+ version: 0.4.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-01-04 00:00:00.000000000 Z
11
+ date: 2024-02-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys