tokenizers 0.4.2 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/Cargo.lock +42 -21
- data/README.md +41 -5
- data/ext/tokenizers/Cargo.toml +2 -2
- data/ext/tokenizers/src/trainers.rs +6 -6
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fe2e8b2ec97ac4dabb4e401cb426079c065be420bf8da41c9484b3d38290589c
|
|
4
|
+
data.tar.gz: bba3b9d9d94a278e3f5189fda48b3b3a7f1ff369b66ae8b85945e92a2d8b90ed
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: fd5876cc95c22b917f1a5715f9b9651f714291f017c00f83316ad2c3cc48c8819d4de70288b4067570c9f729cca9d06cbfa8562996a61ac694e202268a58cdea
|
|
7
|
+
data.tar.gz: cb793a73aabafe933ef8d7259b7a36848b1f38cacdf20ef3f6f8554671b7de6ece53b3c14f46f368f540ecbab5dfca087f730e99c22caf84c7b35d539b07b633
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
|
@@ -25,11 +25,11 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
|
|
25
25
|
|
|
26
26
|
[[package]]
|
|
27
27
|
name = "bindgen"
|
|
28
|
-
version = "0.
|
|
28
|
+
version = "0.69.1"
|
|
29
29
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
30
|
-
checksum = "
|
|
30
|
+
checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2"
|
|
31
31
|
dependencies = [
|
|
32
|
-
"bitflags",
|
|
32
|
+
"bitflags 2.4.1",
|
|
33
33
|
"cexpr",
|
|
34
34
|
"clang-sys",
|
|
35
35
|
"lazy_static",
|
|
@@ -40,7 +40,7 @@ dependencies = [
|
|
|
40
40
|
"regex",
|
|
41
41
|
"rustc-hash",
|
|
42
42
|
"shlex",
|
|
43
|
-
"syn
|
|
43
|
+
"syn 2.0.38",
|
|
44
44
|
]
|
|
45
45
|
|
|
46
46
|
[[package]]
|
|
@@ -49,6 +49,12 @@ version = "1.3.2"
|
|
|
49
49
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
50
50
|
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
|
51
51
|
|
|
52
|
+
[[package]]
|
|
53
|
+
name = "bitflags"
|
|
54
|
+
version = "2.4.1"
|
|
55
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
56
|
+
checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
|
|
57
|
+
|
|
52
58
|
[[package]]
|
|
53
59
|
name = "cc"
|
|
54
60
|
version = "1.0.79"
|
|
@@ -274,6 +280,15 @@ dependencies = [
|
|
|
274
280
|
"either",
|
|
275
281
|
]
|
|
276
282
|
|
|
283
|
+
[[package]]
|
|
284
|
+
name = "itertools"
|
|
285
|
+
version = "0.12.1"
|
|
286
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
287
|
+
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
|
|
288
|
+
dependencies = [
|
|
289
|
+
"either",
|
|
290
|
+
]
|
|
291
|
+
|
|
277
292
|
[[package]]
|
|
278
293
|
name = "itoa"
|
|
279
294
|
version = "1.0.6"
|
|
@@ -335,9 +350,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
|
|
|
335
350
|
|
|
336
351
|
[[package]]
|
|
337
352
|
name = "magnus"
|
|
338
|
-
version = "0.6.
|
|
353
|
+
version = "0.6.2"
|
|
339
354
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
340
|
-
checksum = "
|
|
355
|
+
checksum = "4778544796676e8428e9c622460ebf284bea52d8b10db3aeb449d8b5e61b3a13"
|
|
341
356
|
dependencies = [
|
|
342
357
|
"magnus-macros",
|
|
343
358
|
"rb-sys",
|
|
@@ -426,7 +441,7 @@ version = "6.4.0"
|
|
|
426
441
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
427
442
|
checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f"
|
|
428
443
|
dependencies = [
|
|
429
|
-
"bitflags",
|
|
444
|
+
"bitflags 1.3.2",
|
|
430
445
|
"libc",
|
|
431
446
|
"once_cell",
|
|
432
447
|
"onig_sys",
|
|
@@ -537,7 +552,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
537
552
|
checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9"
|
|
538
553
|
dependencies = [
|
|
539
554
|
"either",
|
|
540
|
-
"itertools",
|
|
555
|
+
"itertools 0.11.0",
|
|
541
556
|
"rayon",
|
|
542
557
|
]
|
|
543
558
|
|
|
@@ -553,18 +568,18 @@ dependencies = [
|
|
|
553
568
|
|
|
554
569
|
[[package]]
|
|
555
570
|
name = "rb-sys"
|
|
556
|
-
version = "0.9.
|
|
571
|
+
version = "0.9.89"
|
|
557
572
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
558
|
-
checksum = "
|
|
573
|
+
checksum = "0d197f2c03751ef006f29d593d22aa9068c9c358e04ca503afea0329c366147c"
|
|
559
574
|
dependencies = [
|
|
560
575
|
"rb-sys-build",
|
|
561
576
|
]
|
|
562
577
|
|
|
563
578
|
[[package]]
|
|
564
579
|
name = "rb-sys-build"
|
|
565
|
-
version = "0.9.
|
|
580
|
+
version = "0.9.89"
|
|
566
581
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
567
|
-
checksum = "
|
|
582
|
+
checksum = "2b50caf8fd028f12abe00d6debe2ae2adf6202c9ca3caa59487eda710d90fa28"
|
|
568
583
|
dependencies = [
|
|
569
584
|
"bindgen",
|
|
570
585
|
"lazy_static",
|
|
@@ -572,7 +587,7 @@ dependencies = [
|
|
|
572
587
|
"quote",
|
|
573
588
|
"regex",
|
|
574
589
|
"shell-words",
|
|
575
|
-
"syn
|
|
590
|
+
"syn 2.0.38",
|
|
576
591
|
]
|
|
577
592
|
|
|
578
593
|
[[package]]
|
|
@@ -590,7 +605,7 @@ dependencies = [
|
|
|
590
605
|
"aho-corasick",
|
|
591
606
|
"memchr",
|
|
592
607
|
"regex-automata",
|
|
593
|
-
"regex-syntax",
|
|
608
|
+
"regex-syntax 0.7.5",
|
|
594
609
|
]
|
|
595
610
|
|
|
596
611
|
[[package]]
|
|
@@ -601,7 +616,7 @@ checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
|
|
|
601
616
|
dependencies = [
|
|
602
617
|
"aho-corasick",
|
|
603
618
|
"memchr",
|
|
604
|
-
"regex-syntax",
|
|
619
|
+
"regex-syntax 0.7.5",
|
|
605
620
|
]
|
|
606
621
|
|
|
607
622
|
[[package]]
|
|
@@ -610,6 +625,12 @@ version = "0.7.5"
|
|
|
610
625
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
611
626
|
checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
|
|
612
627
|
|
|
628
|
+
[[package]]
|
|
629
|
+
name = "regex-syntax"
|
|
630
|
+
version = "0.8.2"
|
|
631
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
632
|
+
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
|
|
633
|
+
|
|
613
634
|
[[package]]
|
|
614
635
|
name = "rustc-hash"
|
|
615
636
|
version = "1.1.0"
|
|
@@ -745,26 +766,26 @@ dependencies = [
|
|
|
745
766
|
|
|
746
767
|
[[package]]
|
|
747
768
|
name = "tokenizers"
|
|
748
|
-
version = "0.4.2"
|
|
769
|
+
version = "0.4.4"
|
|
749
770
|
dependencies = [
|
|
750
771
|
"magnus",
|
|
751
772
|
"onig",
|
|
752
773
|
"serde",
|
|
753
|
-
"tokenizers 0.15.
|
|
774
|
+
"tokenizers 0.15.2",
|
|
754
775
|
]
|
|
755
776
|
|
|
756
777
|
[[package]]
|
|
757
778
|
name = "tokenizers"
|
|
758
|
-
version = "0.15.
|
|
779
|
+
version = "0.15.2"
|
|
759
780
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
760
|
-
checksum = "
|
|
781
|
+
checksum = "3dd47962b0ba36e7fd33518fbf1754d136fd1474000162bbf2a8b5fcb2d3654d"
|
|
761
782
|
dependencies = [
|
|
762
783
|
"aho-corasick",
|
|
763
784
|
"derive_builder",
|
|
764
785
|
"esaxx-rs",
|
|
765
786
|
"getrandom",
|
|
766
787
|
"indicatif",
|
|
767
|
-
"itertools",
|
|
788
|
+
"itertools 0.12.1",
|
|
768
789
|
"lazy_static",
|
|
769
790
|
"log",
|
|
770
791
|
"macro_rules_attribute",
|
|
@@ -775,7 +796,7 @@ dependencies = [
|
|
|
775
796
|
"rayon",
|
|
776
797
|
"rayon-cond",
|
|
777
798
|
"regex",
|
|
778
|
-
"regex-syntax",
|
|
799
|
+
"regex-syntax 0.8.2",
|
|
779
800
|
"serde",
|
|
780
801
|
"serde_json",
|
|
781
802
|
"spm_precompiled",
|
data/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
:slightly_smiling_face: Fast state-of-the-art [tokenizers](https://github.com/huggingface/tokenizers) for Ruby
|
|
4
4
|
|
|
5
|
-
[Build Status](https://github.com/ankane/tokenizers-ruby/actions)
|
|
6
6
|
|
|
7
7
|
## Installation
|
|
8
8
|
|
|
@@ -34,15 +34,51 @@ Decode
|
|
|
34
34
|
tokenizer.decode(ids)
|
|
35
35
|
```
|
|
36
36
|
|
|
37
|
-
|
|
37
|
+
## Training
|
|
38
|
+
|
|
39
|
+
Create a tokenizer
|
|
38
40
|
|
|
39
41
|
```ruby
|
|
40
|
-
tokenizer = Tokenizers::
|
|
42
|
+
tokenizer = Tokenizers::Tokenizer.new(Tokenizers::Models::BPE.new(unk_token: "[UNK]"))
|
|
41
43
|
```
|
|
42
44
|
|
|
43
|
-
|
|
45
|
+
Set the pre-tokenizer
|
|
46
|
+
|
|
47
|
+
```ruby
|
|
48
|
+
tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::Whitespace.new
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Train the tokenizer ([example data](https://huggingface.co/docs/tokenizers/quicktour#build-a-tokenizer-from-scratch))
|
|
52
|
+
|
|
53
|
+
```ruby
|
|
54
|
+
trainer = Tokenizers::Trainers::BpeTrainer.new(special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
|
|
55
|
+
tokenizer.train(["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"], trainer)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Encode
|
|
59
|
+
|
|
60
|
+
```ruby
|
|
61
|
+
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
|
|
62
|
+
output.tokens
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Save the tokenizer to a file
|
|
66
|
+
|
|
67
|
+
```ruby
|
|
68
|
+
tokenizer.save("tokenizer.json")
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Load a tokenizer from a file
|
|
72
|
+
|
|
73
|
+
```ruby
|
|
74
|
+
tokenizer = Tokenizers.from_file("tokenizer.json")
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8) for more info
|
|
78
|
+
|
|
79
|
+
## API
|
|
44
80
|
|
|
45
|
-
|
|
81
|
+
This library follows the [Tokenizers Python API](https://huggingface.co/docs/tokenizers/index). You can follow Python tutorials and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
|
|
46
82
|
|
|
47
83
|
## History
|
|
48
84
|
|
data/ext/tokenizers/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "tokenizers"
|
|
3
|
-
version = "0.4.2"
|
|
3
|
+
version = "0.4.4"
|
|
4
4
|
license = "Apache-2.0"
|
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
|
6
6
|
edition = "2021"
|
|
@@ -16,6 +16,6 @@ onig = { version = "6", default-features = false }
|
|
|
16
16
|
serde = { version = "1", features = ["rc", "derive"] }
|
|
17
17
|
|
|
18
18
|
[dependencies.tokenizers]
|
|
19
|
-
version = "=0.15.
|
|
19
|
+
version = "=0.15.2" # also update in from_pretrained.rb
|
|
20
20
|
default-features = false
|
|
21
21
|
features = ["progressbar", "onig", "esaxx_fast"]
|
|
data/ext/tokenizers/src/trainers.rs
CHANGED
|
@@ -77,11 +77,11 @@ impl RbTrainer {
|
|
|
77
77
|
setter!(self, BpeTrainer, vocab_size, vocab_size);
|
|
78
78
|
}
|
|
79
79
|
|
|
80
|
-
fn bpe_trainer_min_frequency(&self) ->
|
|
80
|
+
fn bpe_trainer_min_frequency(&self) -> u64 {
|
|
81
81
|
getter!(self, BpeTrainer, min_frequency)
|
|
82
82
|
}
|
|
83
83
|
|
|
84
|
-
fn bpe_trainer_set_min_frequency(&self, freq:
|
|
84
|
+
fn bpe_trainer_set_min_frequency(&self, freq: u64) {
|
|
85
85
|
setter!(self, BpeTrainer, min_frequency, freq);
|
|
86
86
|
}
|
|
87
87
|
|
|
@@ -235,11 +235,11 @@ impl RbTrainer {
|
|
|
235
235
|
setter!(self, WordLevelTrainer, vocab_size, vocab_size);
|
|
236
236
|
}
|
|
237
237
|
|
|
238
|
-
fn word_level_trainer_min_frequency(&self) ->
|
|
238
|
+
fn word_level_trainer_min_frequency(&self) -> u64 {
|
|
239
239
|
getter!(self, WordLevelTrainer, min_frequency)
|
|
240
240
|
}
|
|
241
241
|
|
|
242
|
-
fn word_level_trainer_set_min_frequency(&self, freq:
|
|
242
|
+
fn word_level_trainer_set_min_frequency(&self, freq: u64) {
|
|
243
243
|
setter!(self, WordLevelTrainer, min_frequency, freq);
|
|
244
244
|
}
|
|
245
245
|
|
|
@@ -289,11 +289,11 @@ impl RbTrainer {
|
|
|
289
289
|
setter!(self, WordPieceTrainer, @set_vocab_size, vocab_size);
|
|
290
290
|
}
|
|
291
291
|
|
|
292
|
-
fn word_piece_trainer_min_frequency(&self) ->
|
|
292
|
+
fn word_piece_trainer_min_frequency(&self) -> u64 {
|
|
293
293
|
getter!(self, WordPieceTrainer, min_frequency())
|
|
294
294
|
}
|
|
295
295
|
|
|
296
|
-
fn word_piece_trainer_set_min_frequency(&self, freq:
|
|
296
|
+
fn word_piece_trainer_set_min_frequency(&self, freq: u64) {
|
|
297
297
|
setter!(self, WordPieceTrainer, @set_min_frequency, freq);
|
|
298
298
|
}
|
|
299
299
|
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tokenizers
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.4.2
|
|
4
|
+
version: 0.4.4
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Kane
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2024-02-27 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
100
100
|
- !ruby/object:Gem::Version
|
|
101
101
|
version: '0'
|
|
102
102
|
requirements: []
|
|
103
|
-
rubygems_version: 3.
|
|
103
|
+
rubygems_version: 3.5.3
|
|
104
104
|
signing_key:
|
|
105
105
|
specification_version: 4
|
|
106
106
|
summary: Fast state-of-the-art tokenizers for Ruby
|