tokenizers 0.4.3 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/Cargo.lock +28 -13
- data/README.md +1 -1
- data/ext/tokenizers/Cargo.toml +2 -2
- data/ext/tokenizers/src/trainers.rs +6 -6
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fe2e8b2ec97ac4dabb4e401cb426079c065be420bf8da41c9484b3d38290589c
|
4
|
+
data.tar.gz: bba3b9d9d94a278e3f5189fda48b3b3a7f1ff369b66ae8b85945e92a2d8b90ed
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fd5876cc95c22b917f1a5715f9b9651f714291f017c00f83316ad2c3cc48c8819d4de70288b4067570c9f729cca9d06cbfa8562996a61ac694e202268a58cdea
|
7
|
+
data.tar.gz: cb793a73aabafe933ef8d7259b7a36848b1f38cacdf20ef3f6f8554671b7de6ece53b3c14f46f368f540ecbab5dfca087f730e99c22caf84c7b35d539b07b633
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
@@ -280,6 +280,15 @@ dependencies = [
|
|
280
280
|
"either",
|
281
281
|
]
|
282
282
|
|
283
|
+
[[package]]
|
284
|
+
name = "itertools"
|
285
|
+
version = "0.12.1"
|
286
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
287
|
+
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
|
288
|
+
dependencies = [
|
289
|
+
"either",
|
290
|
+
]
|
291
|
+
|
283
292
|
[[package]]
|
284
293
|
name = "itoa"
|
285
294
|
version = "1.0.6"
|
@@ -543,7 +552,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
543
552
|
checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9"
|
544
553
|
dependencies = [
|
545
554
|
"either",
|
546
|
-
"itertools",
|
555
|
+
"itertools 0.11.0",
|
547
556
|
"rayon",
|
548
557
|
]
|
549
558
|
|
@@ -559,18 +568,18 @@ dependencies = [
|
|
559
568
|
|
560
569
|
[[package]]
|
561
570
|
name = "rb-sys"
|
562
|
-
version = "0.9.
|
571
|
+
version = "0.9.89"
|
563
572
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
564
|
-
checksum = "
|
573
|
+
checksum = "0d197f2c03751ef006f29d593d22aa9068c9c358e04ca503afea0329c366147c"
|
565
574
|
dependencies = [
|
566
575
|
"rb-sys-build",
|
567
576
|
]
|
568
577
|
|
569
578
|
[[package]]
|
570
579
|
name = "rb-sys-build"
|
571
|
-
version = "0.9.
|
580
|
+
version = "0.9.89"
|
572
581
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
573
|
-
checksum = "
|
582
|
+
checksum = "2b50caf8fd028f12abe00d6debe2ae2adf6202c9ca3caa59487eda710d90fa28"
|
574
583
|
dependencies = [
|
575
584
|
"bindgen",
|
576
585
|
"lazy_static",
|
@@ -596,7 +605,7 @@ dependencies = [
|
|
596
605
|
"aho-corasick",
|
597
606
|
"memchr",
|
598
607
|
"regex-automata",
|
599
|
-
"regex-syntax",
|
608
|
+
"regex-syntax 0.7.5",
|
600
609
|
]
|
601
610
|
|
602
611
|
[[package]]
|
@@ -607,7 +616,7 @@ checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
|
|
607
616
|
dependencies = [
|
608
617
|
"aho-corasick",
|
609
618
|
"memchr",
|
610
|
-
"regex-syntax",
|
619
|
+
"regex-syntax 0.7.5",
|
611
620
|
]
|
612
621
|
|
613
622
|
[[package]]
|
@@ -616,6 +625,12 @@ version = "0.7.5"
|
|
616
625
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
617
626
|
checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
|
618
627
|
|
628
|
+
[[package]]
|
629
|
+
name = "regex-syntax"
|
630
|
+
version = "0.8.2"
|
631
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
632
|
+
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
|
633
|
+
|
619
634
|
[[package]]
|
620
635
|
name = "rustc-hash"
|
621
636
|
version = "1.1.0"
|
@@ -751,26 +766,26 @@ dependencies = [
|
|
751
766
|
|
752
767
|
[[package]]
|
753
768
|
name = "tokenizers"
|
754
|
-
version = "0.4.3"
|
769
|
+
version = "0.4.4"
|
755
770
|
dependencies = [
|
756
771
|
"magnus",
|
757
772
|
"onig",
|
758
773
|
"serde",
|
759
|
-
"tokenizers 0.15.
|
774
|
+
"tokenizers 0.15.2",
|
760
775
|
]
|
761
776
|
|
762
777
|
[[package]]
|
763
778
|
name = "tokenizers"
|
764
|
-
version = "0.15.
|
779
|
+
version = "0.15.2"
|
765
780
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
766
|
-
checksum = "
|
781
|
+
checksum = "3dd47962b0ba36e7fd33518fbf1754d136fd1474000162bbf2a8b5fcb2d3654d"
|
767
782
|
dependencies = [
|
768
783
|
"aho-corasick",
|
769
784
|
"derive_builder",
|
770
785
|
"esaxx-rs",
|
771
786
|
"getrandom",
|
772
787
|
"indicatif",
|
773
|
-
"itertools",
|
788
|
+
"itertools 0.12.1",
|
774
789
|
"lazy_static",
|
775
790
|
"log",
|
776
791
|
"macro_rules_attribute",
|
@@ -781,7 +796,7 @@ dependencies = [
|
|
781
796
|
"rayon",
|
782
797
|
"rayon-cond",
|
783
798
|
"regex",
|
784
|
-
"regex-syntax",
|
799
|
+
"regex-syntax 0.8.2",
|
785
800
|
"serde",
|
786
801
|
"serde_json",
|
787
802
|
"spm_precompiled",
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
:slightly_smiling_face: Fast state-of-the-art [tokenizers](https://github.com/huggingface/tokenizers) for Ruby
|
4
4
|
|
5
|
-
[![Build Status](https://github.com/ankane/tokenizers-ruby/workflows/build/badge.svg
|
5
|
+
[![Build Status](https://github.com/ankane/tokenizers-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/tokenizers-ruby/actions)
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
data/ext/tokenizers/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "tokenizers"
|
3
|
-
version = "0.4.3"
|
3
|
+
version = "0.4.4"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -16,6 +16,6 @@ onig = { version = "6", default-features = false }
|
|
16
16
|
serde = { version = "1", features = ["rc", "derive"] }
|
17
17
|
|
18
18
|
[dependencies.tokenizers]
|
19
|
-
version = "=0.15.
|
19
|
+
version = "=0.15.2" # also update in from_pretrained.rb
|
20
20
|
default-features = false
|
21
21
|
features = ["progressbar", "onig", "esaxx_fast"]
|
@@ -77,11 +77,11 @@ impl RbTrainer {
|
|
77
77
|
setter!(self, BpeTrainer, vocab_size, vocab_size);
|
78
78
|
}
|
79
79
|
|
80
|
-
fn bpe_trainer_min_frequency(&self) ->
|
80
|
+
fn bpe_trainer_min_frequency(&self) -> u64 {
|
81
81
|
getter!(self, BpeTrainer, min_frequency)
|
82
82
|
}
|
83
83
|
|
84
|
-
fn bpe_trainer_set_min_frequency(&self, freq:
|
84
|
+
fn bpe_trainer_set_min_frequency(&self, freq: u64) {
|
85
85
|
setter!(self, BpeTrainer, min_frequency, freq);
|
86
86
|
}
|
87
87
|
|
@@ -235,11 +235,11 @@ impl RbTrainer {
|
|
235
235
|
setter!(self, WordLevelTrainer, vocab_size, vocab_size);
|
236
236
|
}
|
237
237
|
|
238
|
-
fn word_level_trainer_min_frequency(&self) ->
|
238
|
+
fn word_level_trainer_min_frequency(&self) -> u64 {
|
239
239
|
getter!(self, WordLevelTrainer, min_frequency)
|
240
240
|
}
|
241
241
|
|
242
|
-
fn word_level_trainer_set_min_frequency(&self, freq:
|
242
|
+
fn word_level_trainer_set_min_frequency(&self, freq: u64) {
|
243
243
|
setter!(self, WordLevelTrainer, min_frequency, freq);
|
244
244
|
}
|
245
245
|
|
@@ -289,11 +289,11 @@ impl RbTrainer {
|
|
289
289
|
setter!(self, WordPieceTrainer, @set_vocab_size, vocab_size);
|
290
290
|
}
|
291
291
|
|
292
|
-
fn word_piece_trainer_min_frequency(&self) ->
|
292
|
+
fn word_piece_trainer_min_frequency(&self) -> u64 {
|
293
293
|
getter!(self, WordPieceTrainer, min_frequency())
|
294
294
|
}
|
295
295
|
|
296
|
-
fn word_piece_trainer_set_min_frequency(&self, freq:
|
296
|
+
fn word_piece_trainer_set_min_frequency(&self, freq: u64) {
|
297
297
|
setter!(self, WordPieceTrainer, @set_min_frequency, freq);
|
298
298
|
}
|
299
299
|
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.3
|
4
|
+
version: 0.4.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-02-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|