tokenizers 0.4.2 → 0.4.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/Cargo.lock +19 -13
- data/README.md +40 -4
- data/ext/tokenizers/Cargo.toml +1 -1
- data/lib/tokenizers/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 31ba3313f98f5360a6e9b0434c674d113622cf0421c6e7610ba250a8a9c79402
|
4
|
+
data.tar.gz: ba7c1913bdafa2b58835ac3689a34c37206b1a567c634f81aedaea0ca21a20cf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7cac28260eea675e5cea80324fa755681e9a8a06ce38fc501c57da69056d36d0485576dac1237da21a7d23ca24dacc46b6b1a92d4c7b5f91644fda37b5550ada
|
7
|
+
data.tar.gz: 84c361eadb625a96234b454f91b7f9a847010e42927042978003e3219e1b28d1da8d7665bc9cda7aa820b6053e5c6be1458cdaee6fe1d4709df632fb744155b4
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
@@ -25,11 +25,11 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
|
25
25
|
|
26
26
|
[[package]]
|
27
27
|
name = "bindgen"
|
28
|
-
version = "0.
|
28
|
+
version = "0.69.1"
|
29
29
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
30
|
-
checksum = "
|
30
|
+
checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2"
|
31
31
|
dependencies = [
|
32
|
-
"bitflags",
|
32
|
+
"bitflags 2.4.1",
|
33
33
|
"cexpr",
|
34
34
|
"clang-sys",
|
35
35
|
"lazy_static",
|
@@ -40,7 +40,7 @@ dependencies = [
|
|
40
40
|
"regex",
|
41
41
|
"rustc-hash",
|
42
42
|
"shlex",
|
43
|
-
"syn
|
43
|
+
"syn 2.0.38",
|
44
44
|
]
|
45
45
|
|
46
46
|
[[package]]
|
@@ -49,6 +49,12 @@ version = "1.3.2"
|
|
49
49
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
50
50
|
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
51
51
|
|
52
|
+
[[package]]
|
53
|
+
name = "bitflags"
|
54
|
+
version = "2.4.1"
|
55
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
56
|
+
checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
|
57
|
+
|
52
58
|
[[package]]
|
53
59
|
name = "cc"
|
54
60
|
version = "1.0.79"
|
@@ -335,9 +341,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
|
|
335
341
|
|
336
342
|
[[package]]
|
337
343
|
name = "magnus"
|
338
|
-
version = "0.6.
|
344
|
+
version = "0.6.2"
|
339
345
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
340
|
-
checksum = "
|
346
|
+
checksum = "4778544796676e8428e9c622460ebf284bea52d8b10db3aeb449d8b5e61b3a13"
|
341
347
|
dependencies = [
|
342
348
|
"magnus-macros",
|
343
349
|
"rb-sys",
|
@@ -426,7 +432,7 @@ version = "6.4.0"
|
|
426
432
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
427
433
|
checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f"
|
428
434
|
dependencies = [
|
429
|
-
"bitflags",
|
435
|
+
"bitflags 1.3.2",
|
430
436
|
"libc",
|
431
437
|
"once_cell",
|
432
438
|
"onig_sys",
|
@@ -553,18 +559,18 @@ dependencies = [
|
|
553
559
|
|
554
560
|
[[package]]
|
555
561
|
name = "rb-sys"
|
556
|
-
version = "0.9.
|
562
|
+
version = "0.9.86"
|
557
563
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
558
|
-
checksum = "
|
564
|
+
checksum = "7285f2a7b92f58ab198e3fd59a71d2861478f9c4642f41e83582385818941697"
|
559
565
|
dependencies = [
|
560
566
|
"rb-sys-build",
|
561
567
|
]
|
562
568
|
|
563
569
|
[[package]]
|
564
570
|
name = "rb-sys-build"
|
565
|
-
version = "0.9.
|
571
|
+
version = "0.9.86"
|
566
572
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
567
|
-
checksum = "
|
573
|
+
checksum = "71583945f94dabb6c0dfa63f1b71e929c1901e1e288ef3739ab8bed3b7069550"
|
568
574
|
dependencies = [
|
569
575
|
"bindgen",
|
570
576
|
"lazy_static",
|
@@ -572,7 +578,7 @@ dependencies = [
|
|
572
578
|
"quote",
|
573
579
|
"regex",
|
574
580
|
"shell-words",
|
575
|
-
"syn
|
581
|
+
"syn 2.0.38",
|
576
582
|
]
|
577
583
|
|
578
584
|
[[package]]
|
@@ -745,7 +751,7 @@ dependencies = [
|
|
745
751
|
|
746
752
|
[[package]]
|
747
753
|
name = "tokenizers"
|
748
|
-
version = "0.4.
|
754
|
+
version = "0.4.3"
|
749
755
|
dependencies = [
|
750
756
|
"magnus",
|
751
757
|
"onig",
|
data/README.md
CHANGED
@@ -34,15 +34,51 @@ Decode
|
|
34
34
|
tokenizer.decode(ids)
|
35
35
|
```
|
36
36
|
|
37
|
-
|
37
|
+
## Training
|
38
|
+
|
39
|
+
Create a tokenizer
|
38
40
|
|
39
41
|
```ruby
|
40
|
-
tokenizer = Tokenizers::
|
42
|
+
tokenizer = Tokenizers::Tokenizer.new(Tokenizers::Models::BPE.new(unk_token: "[UNK]"))
|
41
43
|
```
|
42
44
|
|
43
|
-
|
45
|
+
Set the pre-tokenizer
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::Whitespace.new
|
49
|
+
```
|
50
|
+
|
51
|
+
Train the tokenizer ([example data](https://huggingface.co/docs/tokenizers/quicktour#build-a-tokenizer-from-scratch))
|
52
|
+
|
53
|
+
```ruby
|
54
|
+
trainer = Tokenizers::Trainers::BpeTrainer.new(special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
|
55
|
+
tokenizer.train(["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"], trainer)
|
56
|
+
```
|
57
|
+
|
58
|
+
Encode
|
59
|
+
|
60
|
+
```ruby
|
61
|
+
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
|
62
|
+
output.tokens
|
63
|
+
```
|
64
|
+
|
65
|
+
Save the tokenizer to a file
|
66
|
+
|
67
|
+
```ruby
|
68
|
+
tokenizer.save("tokenizer.json")
|
69
|
+
```
|
70
|
+
|
71
|
+
Load a tokenizer from a file
|
72
|
+
|
73
|
+
```ruby
|
74
|
+
tokenizer = Tokenizers.from_file("tokenizer.json")
|
75
|
+
```
|
76
|
+
|
77
|
+
Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8) for more info
|
78
|
+
|
79
|
+
## API
|
44
80
|
|
45
|
-
|
81
|
+
This library follows the [Tokenizers Python API](https://huggingface.co/docs/tokenizers/index). You can follow Python tutorials and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
|
46
82
|
|
47
83
|
## History
|
48
84
|
|
data/ext/tokenizers/Cargo.toml
CHANGED
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.
|
4
|
+
version: 0.4.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-01-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
100
100
|
- !ruby/object:Gem::Version
|
101
101
|
version: '0'
|
102
102
|
requirements: []
|
103
|
-
rubygems_version: 3.
|
103
|
+
rubygems_version: 3.5.3
|
104
104
|
signing_key:
|
105
105
|
specification_version: 4
|
106
106
|
summary: Fast state-of-the-art tokenizers for Ruby
|