tokenizers 0.4.3 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/Cargo.lock +28 -13
- data/README.md +1 -1
- data/ext/tokenizers/Cargo.toml +2 -2
- data/ext/tokenizers/src/trainers.rs +6 -6
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fe2e8b2ec97ac4dabb4e401cb426079c065be420bf8da41c9484b3d38290589c
|
|
4
|
+
data.tar.gz: bba3b9d9d94a278e3f5189fda48b3b3a7f1ff369b66ae8b85945e92a2d8b90ed
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: fd5876cc95c22b917f1a5715f9b9651f714291f017c00f83316ad2c3cc48c8819d4de70288b4067570c9f729cca9d06cbfa8562996a61ac694e202268a58cdea
|
|
7
|
+
data.tar.gz: cb793a73aabafe933ef8d7259b7a36848b1f38cacdf20ef3f6f8554671b7de6ece53b3c14f46f368f540ecbab5dfca087f730e99c22caf84c7b35d539b07b633
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
|
@@ -280,6 +280,15 @@ dependencies = [
|
|
|
280
280
|
"either",
|
|
281
281
|
]
|
|
282
282
|
|
|
283
|
+
[[package]]
|
|
284
|
+
name = "itertools"
|
|
285
|
+
version = "0.12.1"
|
|
286
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
287
|
+
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
|
|
288
|
+
dependencies = [
|
|
289
|
+
"either",
|
|
290
|
+
]
|
|
291
|
+
|
|
283
292
|
[[package]]
|
|
284
293
|
name = "itoa"
|
|
285
294
|
version = "1.0.6"
|
|
@@ -543,7 +552,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
543
552
|
checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9"
|
|
544
553
|
dependencies = [
|
|
545
554
|
"either",
|
|
546
|
-
"itertools",
|
|
555
|
+
"itertools 0.11.0",
|
|
547
556
|
"rayon",
|
|
548
557
|
]
|
|
549
558
|
|
|
@@ -559,18 +568,18 @@ dependencies = [
|
|
|
559
568
|
|
|
560
569
|
[[package]]
|
|
561
570
|
name = "rb-sys"
|
|
562
|
-
version = "0.9.
|
|
571
|
+
version = "0.9.89"
|
|
563
572
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
564
|
-
checksum = "
|
|
573
|
+
checksum = "0d197f2c03751ef006f29d593d22aa9068c9c358e04ca503afea0329c366147c"
|
|
565
574
|
dependencies = [
|
|
566
575
|
"rb-sys-build",
|
|
567
576
|
]
|
|
568
577
|
|
|
569
578
|
[[package]]
|
|
570
579
|
name = "rb-sys-build"
|
|
571
|
-
version = "0.9.
|
|
580
|
+
version = "0.9.89"
|
|
572
581
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
573
|
-
checksum = "
|
|
582
|
+
checksum = "2b50caf8fd028f12abe00d6debe2ae2adf6202c9ca3caa59487eda710d90fa28"
|
|
574
583
|
dependencies = [
|
|
575
584
|
"bindgen",
|
|
576
585
|
"lazy_static",
|
|
@@ -596,7 +605,7 @@ dependencies = [
|
|
|
596
605
|
"aho-corasick",
|
|
597
606
|
"memchr",
|
|
598
607
|
"regex-automata",
|
|
599
|
-
"regex-syntax",
|
|
608
|
+
"regex-syntax 0.7.5",
|
|
600
609
|
]
|
|
601
610
|
|
|
602
611
|
[[package]]
|
|
@@ -607,7 +616,7 @@ checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
|
|
|
607
616
|
dependencies = [
|
|
608
617
|
"aho-corasick",
|
|
609
618
|
"memchr",
|
|
610
|
-
"regex-syntax",
|
|
619
|
+
"regex-syntax 0.7.5",
|
|
611
620
|
]
|
|
612
621
|
|
|
613
622
|
[[package]]
|
|
@@ -616,6 +625,12 @@ version = "0.7.5"
|
|
|
616
625
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
617
626
|
checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
|
|
618
627
|
|
|
628
|
+
[[package]]
|
|
629
|
+
name = "regex-syntax"
|
|
630
|
+
version = "0.8.2"
|
|
631
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
632
|
+
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
|
|
633
|
+
|
|
619
634
|
[[package]]
|
|
620
635
|
name = "rustc-hash"
|
|
621
636
|
version = "1.1.0"
|
|
@@ -751,26 +766,26 @@ dependencies = [
|
|
|
751
766
|
|
|
752
767
|
[[package]]
|
|
753
768
|
name = "tokenizers"
|
|
754
|
-
version = "0.4.3"
|
|
769
|
+
version = "0.4.4"
|
|
755
770
|
dependencies = [
|
|
756
771
|
"magnus",
|
|
757
772
|
"onig",
|
|
758
773
|
"serde",
|
|
759
|
-
"tokenizers 0.15.
|
|
774
|
+
"tokenizers 0.15.2",
|
|
760
775
|
]
|
|
761
776
|
|
|
762
777
|
[[package]]
|
|
763
778
|
name = "tokenizers"
|
|
764
|
-
version = "0.15.
|
|
779
|
+
version = "0.15.2"
|
|
765
780
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
766
|
-
checksum = "
|
|
781
|
+
checksum = "3dd47962b0ba36e7fd33518fbf1754d136fd1474000162bbf2a8b5fcb2d3654d"
|
|
767
782
|
dependencies = [
|
|
768
783
|
"aho-corasick",
|
|
769
784
|
"derive_builder",
|
|
770
785
|
"esaxx-rs",
|
|
771
786
|
"getrandom",
|
|
772
787
|
"indicatif",
|
|
773
|
-
"itertools",
|
|
788
|
+
"itertools 0.12.1",
|
|
774
789
|
"lazy_static",
|
|
775
790
|
"log",
|
|
776
791
|
"macro_rules_attribute",
|
|
@@ -781,7 +796,7 @@ dependencies = [
|
|
|
781
796
|
"rayon",
|
|
782
797
|
"rayon-cond",
|
|
783
798
|
"regex",
|
|
784
|
-
"regex-syntax",
|
|
799
|
+
"regex-syntax 0.8.2",
|
|
785
800
|
"serde",
|
|
786
801
|
"serde_json",
|
|
787
802
|
"spm_precompiled",
|
data/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
:slightly_smiling_face: Fast state-of-the-art [tokenizers](https://github.com/huggingface/tokenizers) for Ruby
|
|
4
4
|
|
|
5
|
-
[](https://github.com/ankane/tokenizers-ruby/actions)
|
|
6
6
|
|
|
7
7
|
## Installation
|
|
8
8
|
|
data/ext/tokenizers/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "tokenizers"
|
|
3
|
-
version = "0.4.3"
|
|
3
|
+
version = "0.4.4"
|
|
4
4
|
license = "Apache-2.0"
|
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
|
6
6
|
edition = "2021"
|
|
@@ -16,6 +16,6 @@ onig = { version = "6", default-features = false }
|
|
|
16
16
|
serde = { version = "1", features = ["rc", "derive"] }
|
|
17
17
|
|
|
18
18
|
[dependencies.tokenizers]
|
|
19
|
-
version = "=0.15.
|
|
19
|
+
version = "=0.15.2" # also update in from_pretrained.rb
|
|
20
20
|
default-features = false
|
|
21
21
|
features = ["progressbar", "onig", "esaxx_fast"]
|
|
@@ -77,11 +77,11 @@ impl RbTrainer {
|
|
|
77
77
|
setter!(self, BpeTrainer, vocab_size, vocab_size);
|
|
78
78
|
}
|
|
79
79
|
|
|
80
|
-
fn bpe_trainer_min_frequency(&self) ->
|
|
80
|
+
fn bpe_trainer_min_frequency(&self) -> u64 {
|
|
81
81
|
getter!(self, BpeTrainer, min_frequency)
|
|
82
82
|
}
|
|
83
83
|
|
|
84
|
-
fn bpe_trainer_set_min_frequency(&self, freq:
|
|
84
|
+
fn bpe_trainer_set_min_frequency(&self, freq: u64) {
|
|
85
85
|
setter!(self, BpeTrainer, min_frequency, freq);
|
|
86
86
|
}
|
|
87
87
|
|
|
@@ -235,11 +235,11 @@ impl RbTrainer {
|
|
|
235
235
|
setter!(self, WordLevelTrainer, vocab_size, vocab_size);
|
|
236
236
|
}
|
|
237
237
|
|
|
238
|
-
fn word_level_trainer_min_frequency(&self) ->
|
|
238
|
+
fn word_level_trainer_min_frequency(&self) -> u64 {
|
|
239
239
|
getter!(self, WordLevelTrainer, min_frequency)
|
|
240
240
|
}
|
|
241
241
|
|
|
242
|
-
fn word_level_trainer_set_min_frequency(&self, freq:
|
|
242
|
+
fn word_level_trainer_set_min_frequency(&self, freq: u64) {
|
|
243
243
|
setter!(self, WordLevelTrainer, min_frequency, freq);
|
|
244
244
|
}
|
|
245
245
|
|
|
@@ -289,11 +289,11 @@ impl RbTrainer {
|
|
|
289
289
|
setter!(self, WordPieceTrainer, @set_vocab_size, vocab_size);
|
|
290
290
|
}
|
|
291
291
|
|
|
292
|
-
fn word_piece_trainer_min_frequency(&self) ->
|
|
292
|
+
fn word_piece_trainer_min_frequency(&self) -> u64 {
|
|
293
293
|
getter!(self, WordPieceTrainer, min_frequency())
|
|
294
294
|
}
|
|
295
295
|
|
|
296
|
-
fn word_piece_trainer_set_min_frequency(&self, freq:
|
|
296
|
+
fn word_piece_trainer_set_min_frequency(&self, freq: u64) {
|
|
297
297
|
setter!(self, WordPieceTrainer, @set_min_frequency, freq);
|
|
298
298
|
}
|
|
299
299
|
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tokenizers
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.4.3
|
|
4
|
+
version: 0.4.4
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Kane
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2024-
|
|
11
|
+
date: 2024-02-27 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|