tokenizers 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 954a1e29eea94d08df1d0a0c667d4554f0f979947e2c714114f5aa16db19ad11
4
- data.tar.gz: 910f44e5c05115dce6ee1fe1070138ba1a826a1142a5f43ff104990687cc7814
3
+ metadata.gz: 31ba3313f98f5360a6e9b0434c674d113622cf0421c6e7610ba250a8a9c79402
4
+ data.tar.gz: ba7c1913bdafa2b58835ac3689a34c37206b1a567c634f81aedaea0ca21a20cf
5
5
  SHA512:
6
- metadata.gz: 8d88bcfaacae964414ad175f4f27f4e20bdf8131a4a97deabdc732f75c12ca2ad956ff3c3f0967d607b46748a8abe7fc1c94538ba9e9ddaa3abaff7132ec15ca
7
- data.tar.gz: 95c8e8225da0070aa947ac8c6436e90f490e5dff033f532f3c2d61af63c7c5beff36398b6a78b4c6e69eca7c4d64834ea43d8c988e3ccdae8ee30e0c84f126bd
6
+ metadata.gz: 7cac28260eea675e5cea80324fa755681e9a8a06ce38fc501c57da69056d36d0485576dac1237da21a7d23ca24dacc46b6b1a92d4c7b5f91644fda37b5550ada
7
+ data.tar.gz: 84c361eadb625a96234b454f91b7f9a847010e42927042978003e3219e1b28d1da8d7665bc9cda7aa820b6053e5c6be1458cdaee6fe1d4709df632fb744155b4
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ ## 0.4.3 (2024-01-03)
2
+
3
+ - Added support for Ruby 3.3
4
+
1
5
  ## 0.4.2 (2023-11-16)
2
6
 
3
7
  - Updated Tokenizers to 0.15.0
data/Cargo.lock CHANGED
@@ -25,11 +25,11 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
25
25
 
26
26
  [[package]]
27
27
  name = "bindgen"
28
- version = "0.62.0"
28
+ version = "0.69.1"
29
29
  source = "registry+https://github.com/rust-lang/crates.io-index"
30
- checksum = "c6720a8b7b2d39dd533285ed438d458f65b31b5c257e6ac7bb3d7e82844dd722"
30
+ checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2"
31
31
  dependencies = [
32
- "bitflags",
32
+ "bitflags 2.4.1",
33
33
  "cexpr",
34
34
  "clang-sys",
35
35
  "lazy_static",
@@ -40,7 +40,7 @@ dependencies = [
40
40
  "regex",
41
41
  "rustc-hash",
42
42
  "shlex",
43
- "syn 1.0.109",
43
+ "syn 2.0.38",
44
44
  ]
45
45
 
46
46
  [[package]]
@@ -49,6 +49,12 @@ version = "1.3.2"
49
49
  source = "registry+https://github.com/rust-lang/crates.io-index"
50
50
  checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
51
51
 
52
+ [[package]]
53
+ name = "bitflags"
54
+ version = "2.4.1"
55
+ source = "registry+https://github.com/rust-lang/crates.io-index"
56
+ checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
57
+
52
58
  [[package]]
53
59
  name = "cc"
54
60
  version = "1.0.79"
@@ -335,9 +341,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
335
341
 
336
342
  [[package]]
337
343
  name = "magnus"
338
- version = "0.6.0"
344
+ version = "0.6.2"
339
345
  source = "registry+https://github.com/rust-lang/crates.io-index"
340
- checksum = "68e9585bfe236e88e6b10b6d8eb5349bd0e0009f3f9dff8d2e99a82601b33743"
346
+ checksum = "4778544796676e8428e9c622460ebf284bea52d8b10db3aeb449d8b5e61b3a13"
341
347
  dependencies = [
342
348
  "magnus-macros",
343
349
  "rb-sys",
@@ -426,7 +432,7 @@ version = "6.4.0"
426
432
  source = "registry+https://github.com/rust-lang/crates.io-index"
427
433
  checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f"
428
434
  dependencies = [
429
- "bitflags",
435
+ "bitflags 1.3.2",
430
436
  "libc",
431
437
  "once_cell",
432
438
  "onig_sys",
@@ -553,18 +559,18 @@ dependencies = [
553
559
 
554
560
  [[package]]
555
561
  name = "rb-sys"
556
- version = "0.9.79"
562
+ version = "0.9.86"
557
563
  source = "registry+https://github.com/rust-lang/crates.io-index"
558
- checksum = "939fb78db3e4f26665c1d4c7b91ca66d3578335a19aba552d4a6445811d07072"
564
+ checksum = "7285f2a7b92f58ab198e3fd59a71d2861478f9c4642f41e83582385818941697"
559
565
  dependencies = [
560
566
  "rb-sys-build",
561
567
  ]
562
568
 
563
569
  [[package]]
564
570
  name = "rb-sys-build"
565
- version = "0.9.79"
571
+ version = "0.9.86"
566
572
  source = "registry+https://github.com/rust-lang/crates.io-index"
567
- checksum = "335a95eb0420d52fa94ef12019df3c2c250c6b19cbb3c60bd05cb7e9c362072c"
573
+ checksum = "71583945f94dabb6c0dfa63f1b71e929c1901e1e288ef3739ab8bed3b7069550"
568
574
  dependencies = [
569
575
  "bindgen",
570
576
  "lazy_static",
@@ -572,7 +578,7 @@ dependencies = [
572
578
  "quote",
573
579
  "regex",
574
580
  "shell-words",
575
- "syn 1.0.109",
581
+ "syn 2.0.38",
576
582
  ]
577
583
 
578
584
  [[package]]
@@ -745,7 +751,7 @@ dependencies = [
745
751
 
746
752
  [[package]]
747
753
  name = "tokenizers"
748
- version = "0.4.2"
754
+ version = "0.4.3"
749
755
  dependencies = [
750
756
  "magnus",
751
757
  "onig",
data/README.md CHANGED
@@ -34,15 +34,51 @@ Decode
34
34
  tokenizer.decode(ids)
35
35
  ```
36
36
 
37
- Load a tokenizer from files
37
+ ## Training
38
+
39
+ Create a tokenizer
38
40
 
39
41
  ```ruby
40
- tokenizer = Tokenizers::CharBPETokenizer.new("vocab.json", "merges.txt")
42
+ tokenizer = Tokenizers::Tokenizer.new(Tokenizers::Models::BPE.new(unk_token: "[UNK]"))
41
43
  ```
42
44
 
43
- ## Training
45
+ Set the pre-tokenizer
46
+
47
+ ```ruby
48
+ tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::Whitespace.new
49
+ ```
50
+
51
+ Train the tokenizer ([example data](https://huggingface.co/docs/tokenizers/quicktour#build-a-tokenizer-from-scratch))
52
+
53
+ ```ruby
54
+ trainer = Tokenizers::Trainers::BpeTrainer.new(special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
55
+ tokenizer.train(["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"], trainer)
56
+ ```
57
+
58
+ Encode
59
+
60
+ ```ruby
61
+ output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
62
+ output.tokens
63
+ ```
64
+
65
+ Save the tokenizer to a file
66
+
67
+ ```ruby
68
+ tokenizer.save("tokenizer.json")
69
+ ```
70
+
71
+ Load a tokenizer from a file
72
+
73
+ ```ruby
74
+ tokenizer = Tokenizers.from_file("tokenizer.json")
75
+ ```
76
+
77
+ Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8) for more info
78
+
79
+ ## API
44
80
 
45
- Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8)
81
+ This library follows the [Tokenizers Python API](https://huggingface.co/docs/tokenizers/index). You can follow Python tutorials and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
46
82
 
47
83
  ## History
48
84
 
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "tokenizers"
3
- version = "0.4.2"
3
+ version = "0.4.3"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.4.2"
2
+ VERSION = "0.4.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-11-16 00:00:00.000000000 Z
11
+ date: 2024-01-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
100
100
  - !ruby/object:Gem::Version
101
101
  version: '0'
102
102
  requirements: []
103
- rubygems_version: 3.4.10
103
+ rubygems_version: 3.5.3
104
104
  signing_key:
105
105
  specification_version: 4
106
106
  summary: Fast state-of-the-art tokenizers for Ruby