tokenizers 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/Cargo.lock +19 -13
- data/README.md +40 -4
- data/ext/tokenizers/Cargo.toml +1 -1
- data/lib/tokenizers/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 31ba3313f98f5360a6e9b0434c674d113622cf0421c6e7610ba250a8a9c79402
|
4
|
+
data.tar.gz: ba7c1913bdafa2b58835ac3689a34c37206b1a567c634f81aedaea0ca21a20cf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7cac28260eea675e5cea80324fa755681e9a8a06ce38fc501c57da69056d36d0485576dac1237da21a7d23ca24dacc46b6b1a92d4c7b5f91644fda37b5550ada
|
7
|
+
data.tar.gz: 84c361eadb625a96234b454f91b7f9a847010e42927042978003e3219e1b28d1da8d7665bc9cda7aa820b6053e5c6be1458cdaee6fe1d4709df632fb744155b4
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
@@ -25,11 +25,11 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
|
25
25
|
|
26
26
|
[[package]]
|
27
27
|
name = "bindgen"
|
28
|
-
version = "0.
|
28
|
+
version = "0.69.1"
|
29
29
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
30
|
-
checksum = "
|
30
|
+
checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2"
|
31
31
|
dependencies = [
|
32
|
-
"bitflags",
|
32
|
+
"bitflags 2.4.1",
|
33
33
|
"cexpr",
|
34
34
|
"clang-sys",
|
35
35
|
"lazy_static",
|
@@ -40,7 +40,7 @@ dependencies = [
|
|
40
40
|
"regex",
|
41
41
|
"rustc-hash",
|
42
42
|
"shlex",
|
43
|
-
"syn
|
43
|
+
"syn 2.0.38",
|
44
44
|
]
|
45
45
|
|
46
46
|
[[package]]
|
@@ -49,6 +49,12 @@ version = "1.3.2"
|
|
49
49
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
50
50
|
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
51
51
|
|
52
|
+
[[package]]
|
53
|
+
name = "bitflags"
|
54
|
+
version = "2.4.1"
|
55
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
56
|
+
checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
|
57
|
+
|
52
58
|
[[package]]
|
53
59
|
name = "cc"
|
54
60
|
version = "1.0.79"
|
@@ -335,9 +341,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
|
|
335
341
|
|
336
342
|
[[package]]
|
337
343
|
name = "magnus"
|
338
|
-
version = "0.6.
|
344
|
+
version = "0.6.2"
|
339
345
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
340
|
-
checksum = "
|
346
|
+
checksum = "4778544796676e8428e9c622460ebf284bea52d8b10db3aeb449d8b5e61b3a13"
|
341
347
|
dependencies = [
|
342
348
|
"magnus-macros",
|
343
349
|
"rb-sys",
|
@@ -426,7 +432,7 @@ version = "6.4.0"
|
|
426
432
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
427
433
|
checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f"
|
428
434
|
dependencies = [
|
429
|
-
"bitflags",
|
435
|
+
"bitflags 1.3.2",
|
430
436
|
"libc",
|
431
437
|
"once_cell",
|
432
438
|
"onig_sys",
|
@@ -553,18 +559,18 @@ dependencies = [
|
|
553
559
|
|
554
560
|
[[package]]
|
555
561
|
name = "rb-sys"
|
556
|
-
version = "0.9.
|
562
|
+
version = "0.9.86"
|
557
563
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
558
|
-
checksum = "
|
564
|
+
checksum = "7285f2a7b92f58ab198e3fd59a71d2861478f9c4642f41e83582385818941697"
|
559
565
|
dependencies = [
|
560
566
|
"rb-sys-build",
|
561
567
|
]
|
562
568
|
|
563
569
|
[[package]]
|
564
570
|
name = "rb-sys-build"
|
565
|
-
version = "0.9.
|
571
|
+
version = "0.9.86"
|
566
572
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
567
|
-
checksum = "
|
573
|
+
checksum = "71583945f94dabb6c0dfa63f1b71e929c1901e1e288ef3739ab8bed3b7069550"
|
568
574
|
dependencies = [
|
569
575
|
"bindgen",
|
570
576
|
"lazy_static",
|
@@ -572,7 +578,7 @@ dependencies = [
|
|
572
578
|
"quote",
|
573
579
|
"regex",
|
574
580
|
"shell-words",
|
575
|
-
"syn
|
581
|
+
"syn 2.0.38",
|
576
582
|
]
|
577
583
|
|
578
584
|
[[package]]
|
@@ -745,7 +751,7 @@ dependencies = [
|
|
745
751
|
|
746
752
|
[[package]]
|
747
753
|
name = "tokenizers"
|
748
|
-
version = "0.4.2"
|
754
|
+
version = "0.4.3"
|
749
755
|
dependencies = [
|
750
756
|
"magnus",
|
751
757
|
"onig",
|
data/README.md
CHANGED
@@ -34,15 +34,51 @@ Decode
|
|
34
34
|
tokenizer.decode(ids)
|
35
35
|
```
|
36
36
|
|
37
|
-
|
37
|
+
## Training
|
38
|
+
|
39
|
+
Create a tokenizer
|
38
40
|
|
39
41
|
```ruby
|
40
|
-
tokenizer = Tokenizers::
|
42
|
+
tokenizer = Tokenizers::Tokenizer.new(Tokenizers::Models::BPE.new(unk_token: "[UNK]"))
|
41
43
|
```
|
42
44
|
|
43
|
-
|
45
|
+
Set the pre-tokenizer
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::Whitespace.new
|
49
|
+
```
|
50
|
+
|
51
|
+
Train the tokenizer ([example data](https://huggingface.co/docs/tokenizers/quicktour#build-a-tokenizer-from-scratch))
|
52
|
+
|
53
|
+
```ruby
|
54
|
+
trainer = Tokenizers::Trainers::BpeTrainer.new(special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
|
55
|
+
tokenizer.train(["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"], trainer)
|
56
|
+
```
|
57
|
+
|
58
|
+
Encode
|
59
|
+
|
60
|
+
```ruby
|
61
|
+
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
|
62
|
+
output.tokens
|
63
|
+
```
|
64
|
+
|
65
|
+
Save the tokenizer to a file
|
66
|
+
|
67
|
+
```ruby
|
68
|
+
tokenizer.save("tokenizer.json")
|
69
|
+
```
|
70
|
+
|
71
|
+
Load a tokenizer from a file
|
72
|
+
|
73
|
+
```ruby
|
74
|
+
tokenizer = Tokenizers.from_file("tokenizer.json")
|
75
|
+
```
|
76
|
+
|
77
|
+
Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8) for more info
|
78
|
+
|
79
|
+
## API
|
44
80
|
|
45
|
-
|
81
|
+
This library follows the [Tokenizers Python API](https://huggingface.co/docs/tokenizers/index). You can follow Python tutorials and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
|
46
82
|
|
47
83
|
## History
|
48
84
|
|
data/ext/tokenizers/Cargo.toml
CHANGED
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.2
|
4
|
+
version: 0.4.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-01-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
100
100
|
- !ruby/object:Gem::Version
|
101
101
|
version: '0'
|
102
102
|
requirements: []
|
103
|
-
rubygems_version: 3.
|
103
|
+
rubygems_version: 3.5.3
|
104
104
|
signing_key:
|
105
105
|
specification_version: 4
|
106
106
|
summary: Fast state-of-the-art tokenizers for Ruby
|