tokenizers 0.4.2 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/Cargo.lock +42 -21
- data/README.md +41 -5
- data/ext/tokenizers/Cargo.toml +2 -2
- data/ext/tokenizers/src/trainers.rs +6 -6
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fe2e8b2ec97ac4dabb4e401cb426079c065be420bf8da41c9484b3d38290589c
|
4
|
+
data.tar.gz: bba3b9d9d94a278e3f5189fda48b3b3a7f1ff369b66ae8b85945e92a2d8b90ed
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fd5876cc95c22b917f1a5715f9b9651f714291f017c00f83316ad2c3cc48c8819d4de70288b4067570c9f729cca9d06cbfa8562996a61ac694e202268a58cdea
|
7
|
+
data.tar.gz: cb793a73aabafe933ef8d7259b7a36848b1f38cacdf20ef3f6f8554671b7de6ece53b3c14f46f368f540ecbab5dfca087f730e99c22caf84c7b35d539b07b633
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
@@ -25,11 +25,11 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
|
25
25
|
|
26
26
|
[[package]]
|
27
27
|
name = "bindgen"
|
28
|
-
version = "0.
|
28
|
+
version = "0.69.1"
|
29
29
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
30
|
-
checksum = "
|
30
|
+
checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2"
|
31
31
|
dependencies = [
|
32
|
-
"bitflags",
|
32
|
+
"bitflags 2.4.1",
|
33
33
|
"cexpr",
|
34
34
|
"clang-sys",
|
35
35
|
"lazy_static",
|
@@ -40,7 +40,7 @@ dependencies = [
|
|
40
40
|
"regex",
|
41
41
|
"rustc-hash",
|
42
42
|
"shlex",
|
43
|
-
"syn
|
43
|
+
"syn 2.0.38",
|
44
44
|
]
|
45
45
|
|
46
46
|
[[package]]
|
@@ -49,6 +49,12 @@ version = "1.3.2"
|
|
49
49
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
50
50
|
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
51
51
|
|
52
|
+
[[package]]
|
53
|
+
name = "bitflags"
|
54
|
+
version = "2.4.1"
|
55
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
56
|
+
checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
|
57
|
+
|
52
58
|
[[package]]
|
53
59
|
name = "cc"
|
54
60
|
version = "1.0.79"
|
@@ -274,6 +280,15 @@ dependencies = [
|
|
274
280
|
"either",
|
275
281
|
]
|
276
282
|
|
283
|
+
[[package]]
|
284
|
+
name = "itertools"
|
285
|
+
version = "0.12.1"
|
286
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
287
|
+
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
|
288
|
+
dependencies = [
|
289
|
+
"either",
|
290
|
+
]
|
291
|
+
|
277
292
|
[[package]]
|
278
293
|
name = "itoa"
|
279
294
|
version = "1.0.6"
|
@@ -335,9 +350,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
|
|
335
350
|
|
336
351
|
[[package]]
|
337
352
|
name = "magnus"
|
338
|
-
version = "0.6.
|
353
|
+
version = "0.6.2"
|
339
354
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
340
|
-
checksum = "
|
355
|
+
checksum = "4778544796676e8428e9c622460ebf284bea52d8b10db3aeb449d8b5e61b3a13"
|
341
356
|
dependencies = [
|
342
357
|
"magnus-macros",
|
343
358
|
"rb-sys",
|
@@ -426,7 +441,7 @@ version = "6.4.0"
|
|
426
441
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
427
442
|
checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f"
|
428
443
|
dependencies = [
|
429
|
-
"bitflags",
|
444
|
+
"bitflags 1.3.2",
|
430
445
|
"libc",
|
431
446
|
"once_cell",
|
432
447
|
"onig_sys",
|
@@ -537,7 +552,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
537
552
|
checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9"
|
538
553
|
dependencies = [
|
539
554
|
"either",
|
540
|
-
"itertools",
|
555
|
+
"itertools 0.11.0",
|
541
556
|
"rayon",
|
542
557
|
]
|
543
558
|
|
@@ -553,18 +568,18 @@ dependencies = [
|
|
553
568
|
|
554
569
|
[[package]]
|
555
570
|
name = "rb-sys"
|
556
|
-
version = "0.9.
|
571
|
+
version = "0.9.89"
|
557
572
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
558
|
-
checksum = "
|
573
|
+
checksum = "0d197f2c03751ef006f29d593d22aa9068c9c358e04ca503afea0329c366147c"
|
559
574
|
dependencies = [
|
560
575
|
"rb-sys-build",
|
561
576
|
]
|
562
577
|
|
563
578
|
[[package]]
|
564
579
|
name = "rb-sys-build"
|
565
|
-
version = "0.9.
|
580
|
+
version = "0.9.89"
|
566
581
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
567
|
-
checksum = "
|
582
|
+
checksum = "2b50caf8fd028f12abe00d6debe2ae2adf6202c9ca3caa59487eda710d90fa28"
|
568
583
|
dependencies = [
|
569
584
|
"bindgen",
|
570
585
|
"lazy_static",
|
@@ -572,7 +587,7 @@ dependencies = [
|
|
572
587
|
"quote",
|
573
588
|
"regex",
|
574
589
|
"shell-words",
|
575
|
-
"syn
|
590
|
+
"syn 2.0.38",
|
576
591
|
]
|
577
592
|
|
578
593
|
[[package]]
|
@@ -590,7 +605,7 @@ dependencies = [
|
|
590
605
|
"aho-corasick",
|
591
606
|
"memchr",
|
592
607
|
"regex-automata",
|
593
|
-
"regex-syntax",
|
608
|
+
"regex-syntax 0.7.5",
|
594
609
|
]
|
595
610
|
|
596
611
|
[[package]]
|
@@ -601,7 +616,7 @@ checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
|
|
601
616
|
dependencies = [
|
602
617
|
"aho-corasick",
|
603
618
|
"memchr",
|
604
|
-
"regex-syntax",
|
619
|
+
"regex-syntax 0.7.5",
|
605
620
|
]
|
606
621
|
|
607
622
|
[[package]]
|
@@ -610,6 +625,12 @@ version = "0.7.5"
|
|
610
625
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
611
626
|
checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
|
612
627
|
|
628
|
+
[[package]]
|
629
|
+
name = "regex-syntax"
|
630
|
+
version = "0.8.2"
|
631
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
632
|
+
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
|
633
|
+
|
613
634
|
[[package]]
|
614
635
|
name = "rustc-hash"
|
615
636
|
version = "1.1.0"
|
@@ -745,26 +766,26 @@ dependencies = [
|
|
745
766
|
|
746
767
|
[[package]]
|
747
768
|
name = "tokenizers"
|
748
|
-
version = "0.4.2"
|
769
|
+
version = "0.4.4"
|
749
770
|
dependencies = [
|
750
771
|
"magnus",
|
751
772
|
"onig",
|
752
773
|
"serde",
|
753
|
-
"tokenizers 0.15.
|
774
|
+
"tokenizers 0.15.2",
|
754
775
|
]
|
755
776
|
|
756
777
|
[[package]]
|
757
778
|
name = "tokenizers"
|
758
|
-
version = "0.15.
|
779
|
+
version = "0.15.2"
|
759
780
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
760
|
-
checksum = "
|
781
|
+
checksum = "3dd47962b0ba36e7fd33518fbf1754d136fd1474000162bbf2a8b5fcb2d3654d"
|
761
782
|
dependencies = [
|
762
783
|
"aho-corasick",
|
763
784
|
"derive_builder",
|
764
785
|
"esaxx-rs",
|
765
786
|
"getrandom",
|
766
787
|
"indicatif",
|
767
|
-
"itertools",
|
788
|
+
"itertools 0.12.1",
|
768
789
|
"lazy_static",
|
769
790
|
"log",
|
770
791
|
"macro_rules_attribute",
|
@@ -775,7 +796,7 @@ dependencies = [
|
|
775
796
|
"rayon",
|
776
797
|
"rayon-cond",
|
777
798
|
"regex",
|
778
|
-
"regex-syntax",
|
799
|
+
"regex-syntax 0.8.2",
|
779
800
|
"serde",
|
780
801
|
"serde_json",
|
781
802
|
"spm_precompiled",
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
:slightly_smiling_face: Fast state-of-the-art [tokenizers](https://github.com/huggingface/tokenizers) for Ruby
|
4
4
|
|
5
|
-
[![Build Status](https://github.com/ankane/tokenizers-ruby/workflows/build/badge.svg
|
5
|
+
[![Build Status](https://github.com/ankane/tokenizers-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/tokenizers-ruby/actions)
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
@@ -34,15 +34,51 @@ Decode
|
|
34
34
|
tokenizer.decode(ids)
|
35
35
|
```
|
36
36
|
|
37
|
-
|
37
|
+
## Training
|
38
|
+
|
39
|
+
Create a tokenizer
|
38
40
|
|
39
41
|
```ruby
|
40
|
-
tokenizer = Tokenizers::
|
42
|
+
tokenizer = Tokenizers::Tokenizer.new(Tokenizers::Models::BPE.new(unk_token: "[UNK]"))
|
41
43
|
```
|
42
44
|
|
43
|
-
|
45
|
+
Set the pre-tokenizer
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::Whitespace.new
|
49
|
+
```
|
50
|
+
|
51
|
+
Train the tokenizer ([example data](https://huggingface.co/docs/tokenizers/quicktour#build-a-tokenizer-from-scratch))
|
52
|
+
|
53
|
+
```ruby
|
54
|
+
trainer = Tokenizers::Trainers::BpeTrainer.new(special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
|
55
|
+
tokenizer.train(["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"], trainer)
|
56
|
+
```
|
57
|
+
|
58
|
+
Encode
|
59
|
+
|
60
|
+
```ruby
|
61
|
+
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
|
62
|
+
output.tokens
|
63
|
+
```
|
64
|
+
|
65
|
+
Save the tokenizer to a file
|
66
|
+
|
67
|
+
```ruby
|
68
|
+
tokenizer.save("tokenizer.json")
|
69
|
+
```
|
70
|
+
|
71
|
+
Load a tokenizer from a file
|
72
|
+
|
73
|
+
```ruby
|
74
|
+
tokenizer = Tokenizers.from_file("tokenizer.json")
|
75
|
+
```
|
76
|
+
|
77
|
+
Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8) for more info
|
78
|
+
|
79
|
+
## API
|
44
80
|
|
45
|
-
|
81
|
+
This library follows the [Tokenizers Python API](https://huggingface.co/docs/tokenizers/index). You can follow Python tutorials and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
|
46
82
|
|
47
83
|
## History
|
48
84
|
|
data/ext/tokenizers/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "tokenizers"
|
3
|
-
version = "0.4.2"
|
3
|
+
version = "0.4.4"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -16,6 +16,6 @@ onig = { version = "6", default-features = false }
|
|
16
16
|
serde = { version = "1", features = ["rc", "derive"] }
|
17
17
|
|
18
18
|
[dependencies.tokenizers]
|
19
|
-
version = "=0.15.
|
19
|
+
version = "=0.15.2" # also update in from_pretrained.rb
|
20
20
|
default-features = false
|
21
21
|
features = ["progressbar", "onig", "esaxx_fast"]
|
@@ -77,11 +77,11 @@ impl RbTrainer {
|
|
77
77
|
setter!(self, BpeTrainer, vocab_size, vocab_size);
|
78
78
|
}
|
79
79
|
|
80
|
-
fn bpe_trainer_min_frequency(&self) ->
|
80
|
+
fn bpe_trainer_min_frequency(&self) -> u64 {
|
81
81
|
getter!(self, BpeTrainer, min_frequency)
|
82
82
|
}
|
83
83
|
|
84
|
-
fn bpe_trainer_set_min_frequency(&self, freq:
|
84
|
+
fn bpe_trainer_set_min_frequency(&self, freq: u64) {
|
85
85
|
setter!(self, BpeTrainer, min_frequency, freq);
|
86
86
|
}
|
87
87
|
|
@@ -235,11 +235,11 @@ impl RbTrainer {
|
|
235
235
|
setter!(self, WordLevelTrainer, vocab_size, vocab_size);
|
236
236
|
}
|
237
237
|
|
238
|
-
fn word_level_trainer_min_frequency(&self) ->
|
238
|
+
fn word_level_trainer_min_frequency(&self) -> u64 {
|
239
239
|
getter!(self, WordLevelTrainer, min_frequency)
|
240
240
|
}
|
241
241
|
|
242
|
-
fn word_level_trainer_set_min_frequency(&self, freq:
|
242
|
+
fn word_level_trainer_set_min_frequency(&self, freq: u64) {
|
243
243
|
setter!(self, WordLevelTrainer, min_frequency, freq);
|
244
244
|
}
|
245
245
|
|
@@ -289,11 +289,11 @@ impl RbTrainer {
|
|
289
289
|
setter!(self, WordPieceTrainer, @set_vocab_size, vocab_size);
|
290
290
|
}
|
291
291
|
|
292
|
-
fn word_piece_trainer_min_frequency(&self) ->
|
292
|
+
fn word_piece_trainer_min_frequency(&self) -> u64 {
|
293
293
|
getter!(self, WordPieceTrainer, min_frequency())
|
294
294
|
}
|
295
295
|
|
296
|
-
fn word_piece_trainer_set_min_frequency(&self, freq:
|
296
|
+
fn word_piece_trainer_set_min_frequency(&self, freq: u64) {
|
297
297
|
setter!(self, WordPieceTrainer, @set_min_frequency, freq);
|
298
298
|
}
|
299
299
|
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.2
|
4
|
+
version: 0.4.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-02-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
100
100
|
- !ruby/object:Gem::Version
|
101
101
|
version: '0'
|
102
102
|
requirements: []
|
103
|
-
rubygems_version: 3.
|
103
|
+
rubygems_version: 3.5.3
|
104
104
|
signing_key:
|
105
105
|
specification_version: 4
|
106
106
|
summary: Fast state-of-the-art tokenizers for Ruby
|