tokenizers 0.4.2 → 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 954a1e29eea94d08df1d0a0c667d4554f0f979947e2c714114f5aa16db19ad11
4
- data.tar.gz: 910f44e5c05115dce6ee1fe1070138ba1a826a1142a5f43ff104990687cc7814
3
+ metadata.gz: 31ba3313f98f5360a6e9b0434c674d113622cf0421c6e7610ba250a8a9c79402
4
+ data.tar.gz: ba7c1913bdafa2b58835ac3689a34c37206b1a567c634f81aedaea0ca21a20cf
5
5
  SHA512:
6
- metadata.gz: 8d88bcfaacae964414ad175f4f27f4e20bdf8131a4a97deabdc732f75c12ca2ad956ff3c3f0967d607b46748a8abe7fc1c94538ba9e9ddaa3abaff7132ec15ca
7
- data.tar.gz: 95c8e8225da0070aa947ac8c6436e90f490e5dff033f532f3c2d61af63c7c5beff36398b6a78b4c6e69eca7c4d64834ea43d8c988e3ccdae8ee30e0c84f126bd
6
+ metadata.gz: 7cac28260eea675e5cea80324fa755681e9a8a06ce38fc501c57da69056d36d0485576dac1237da21a7d23ca24dacc46b6b1a92d4c7b5f91644fda37b5550ada
7
+ data.tar.gz: 84c361eadb625a96234b454f91b7f9a847010e42927042978003e3219e1b28d1da8d7665bc9cda7aa820b6053e5c6be1458cdaee6fe1d4709df632fb744155b4
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ ## 0.4.3 (2024-01-03)
2
+
3
+ - Added support for Ruby 3.3
4
+
1
5
  ## 0.4.2 (2023-11-16)
2
6
 
3
7
  - Updated Tokenizers to 0.15.0
data/Cargo.lock CHANGED
@@ -25,11 +25,11 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
25
25
 
26
26
  [[package]]
27
27
  name = "bindgen"
28
- version = "0.62.0"
28
+ version = "0.69.1"
29
29
  source = "registry+https://github.com/rust-lang/crates.io-index"
30
- checksum = "c6720a8b7b2d39dd533285ed438d458f65b31b5c257e6ac7bb3d7e82844dd722"
30
+ checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2"
31
31
  dependencies = [
32
- "bitflags",
32
+ "bitflags 2.4.1",
33
33
  "cexpr",
34
34
  "clang-sys",
35
35
  "lazy_static",
@@ -40,7 +40,7 @@ dependencies = [
40
40
  "regex",
41
41
  "rustc-hash",
42
42
  "shlex",
43
- "syn 1.0.109",
43
+ "syn 2.0.38",
44
44
  ]
45
45
 
46
46
  [[package]]
@@ -49,6 +49,12 @@ version = "1.3.2"
49
49
  source = "registry+https://github.com/rust-lang/crates.io-index"
50
50
  checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
51
51
 
52
+ [[package]]
53
+ name = "bitflags"
54
+ version = "2.4.1"
55
+ source = "registry+https://github.com/rust-lang/crates.io-index"
56
+ checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
57
+
52
58
  [[package]]
53
59
  name = "cc"
54
60
  version = "1.0.79"
@@ -335,9 +341,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
335
341
 
336
342
  [[package]]
337
343
  name = "magnus"
338
- version = "0.6.0"
344
+ version = "0.6.2"
339
345
  source = "registry+https://github.com/rust-lang/crates.io-index"
340
- checksum = "68e9585bfe236e88e6b10b6d8eb5349bd0e0009f3f9dff8d2e99a82601b33743"
346
+ checksum = "4778544796676e8428e9c622460ebf284bea52d8b10db3aeb449d8b5e61b3a13"
341
347
  dependencies = [
342
348
  "magnus-macros",
343
349
  "rb-sys",
@@ -426,7 +432,7 @@ version = "6.4.0"
426
432
  source = "registry+https://github.com/rust-lang/crates.io-index"
427
433
  checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f"
428
434
  dependencies = [
429
- "bitflags",
435
+ "bitflags 1.3.2",
430
436
  "libc",
431
437
  "once_cell",
432
438
  "onig_sys",
@@ -553,18 +559,18 @@ dependencies = [
553
559
 
554
560
  [[package]]
555
561
  name = "rb-sys"
556
- version = "0.9.79"
562
+ version = "0.9.86"
557
563
  source = "registry+https://github.com/rust-lang/crates.io-index"
558
- checksum = "939fb78db3e4f26665c1d4c7b91ca66d3578335a19aba552d4a6445811d07072"
564
+ checksum = "7285f2a7b92f58ab198e3fd59a71d2861478f9c4642f41e83582385818941697"
559
565
  dependencies = [
560
566
  "rb-sys-build",
561
567
  ]
562
568
 
563
569
  [[package]]
564
570
  name = "rb-sys-build"
565
- version = "0.9.79"
571
+ version = "0.9.86"
566
572
  source = "registry+https://github.com/rust-lang/crates.io-index"
567
- checksum = "335a95eb0420d52fa94ef12019df3c2c250c6b19cbb3c60bd05cb7e9c362072c"
573
+ checksum = "71583945f94dabb6c0dfa63f1b71e929c1901e1e288ef3739ab8bed3b7069550"
568
574
  dependencies = [
569
575
  "bindgen",
570
576
  "lazy_static",
@@ -572,7 +578,7 @@ dependencies = [
572
578
  "quote",
573
579
  "regex",
574
580
  "shell-words",
575
- "syn 1.0.109",
581
+ "syn 2.0.38",
576
582
  ]
577
583
 
578
584
  [[package]]
@@ -745,7 +751,7 @@ dependencies = [
745
751
 
746
752
  [[package]]
747
753
  name = "tokenizers"
748
- version = "0.4.2"
754
+ version = "0.4.3"
749
755
  dependencies = [
750
756
  "magnus",
751
757
  "onig",
data/README.md CHANGED
@@ -34,15 +34,51 @@ Decode
34
34
  tokenizer.decode(ids)
35
35
  ```
36
36
 
37
- Load a tokenizer from files
37
+ ## Training
38
+
39
+ Create a tokenizer
38
40
 
39
41
  ```ruby
40
- tokenizer = Tokenizers::CharBPETokenizer.new("vocab.json", "merges.txt")
42
+ tokenizer = Tokenizers::Tokenizer.new(Tokenizers::Models::BPE.new(unk_token: "[UNK]"))
41
43
  ```
42
44
 
43
- ## Training
45
+ Set the pre-tokenizer
46
+
47
+ ```ruby
48
+ tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::Whitespace.new
49
+ ```
50
+
51
+ Train the tokenizer ([example data](https://huggingface.co/docs/tokenizers/quicktour#build-a-tokenizer-from-scratch))
52
+
53
+ ```ruby
54
+ trainer = Tokenizers::Trainers::BpeTrainer.new(special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
55
+ tokenizer.train(["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"], trainer)
56
+ ```
57
+
58
+ Encode
59
+
60
+ ```ruby
61
+ output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
62
+ output.tokens
63
+ ```
64
+
65
+ Save the tokenizer to a file
66
+
67
+ ```ruby
68
+ tokenizer.save("tokenizer.json")
69
+ ```
70
+
71
+ Load a tokenizer from a file
72
+
73
+ ```ruby
74
+ tokenizer = Tokenizers.from_file("tokenizer.json")
75
+ ```
76
+
77
+ Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8) for more info
78
+
79
+ ## API
44
80
 
45
- Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8)
81
+ This library follows the [Tokenizers Python API](https://huggingface.co/docs/tokenizers/index). You can follow Python tutorials and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
46
82
 
47
83
  ## History
48
84
 
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "tokenizers"
3
- version = "0.4.2"
3
+ version = "0.4.3"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.4.2"
2
+ VERSION = "0.4.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-11-16 00:00:00.000000000 Z
11
+ date: 2024-01-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
100
100
  - !ruby/object:Gem::Version
101
101
  version: '0'
102
102
  requirements: []
103
- rubygems_version: 3.4.10
103
+ rubygems_version: 3.5.3
104
104
  signing_key:
105
105
  specification_version: 4
106
106
  summary: Fast state-of-the-art tokenizers for Ruby