tokenizers 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/CHANGELOG.md +4 -0
 - data/Cargo.lock +19 -13
 - data/README.md +40 -4
 - data/ext/tokenizers/Cargo.toml +1 -1
 - data/lib/tokenizers/version.rb +1 -1
 - metadata +3 -3
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 31ba3313f98f5360a6e9b0434c674d113622cf0421c6e7610ba250a8a9c79402
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: ba7c1913bdafa2b58835ac3689a34c37206b1a567c634f81aedaea0ca21a20cf
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 7cac28260eea675e5cea80324fa755681e9a8a06ce38fc501c57da69056d36d0485576dac1237da21a7d23ca24dacc46b6b1a92d4c7b5f91644fda37b5550ada
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: 84c361eadb625a96234b454f91b7f9a847010e42927042978003e3219e1b28d1da8d7665bc9cda7aa820b6053e5c6be1458cdaee6fe1d4709df632fb744155b4
         
     | 
    
        data/CHANGELOG.md
    CHANGED
    
    
    
        data/Cargo.lock
    CHANGED
    
    | 
         @@ -25,11 +25,11 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" 
     | 
|
| 
       25 
25 
     | 
    
         | 
| 
       26 
26 
     | 
    
         
             
            [[package]]
         
     | 
| 
       27 
27 
     | 
    
         
             
            name = "bindgen"
         
     | 
| 
       28 
     | 
    
         
            -
            version = "0. 
     | 
| 
      
 28 
     | 
    
         
            +
            version = "0.69.1"
         
     | 
| 
       29 
29 
     | 
    
         
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         
     | 
| 
       30 
     | 
    
         
            -
            checksum = " 
     | 
| 
      
 30 
     | 
    
         
            +
            checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2"
         
     | 
| 
       31 
31 
     | 
    
         
             
            dependencies = [
         
     | 
| 
       32 
     | 
    
         
            -
             "bitflags",
         
     | 
| 
      
 32 
     | 
    
         
            +
             "bitflags 2.4.1",
         
     | 
| 
       33 
33 
     | 
    
         
             
             "cexpr",
         
     | 
| 
       34 
34 
     | 
    
         
             
             "clang-sys",
         
     | 
| 
       35 
35 
     | 
    
         
             
             "lazy_static",
         
     | 
| 
         @@ -40,7 +40,7 @@ dependencies = [ 
     | 
|
| 
       40 
40 
     | 
    
         
             
             "regex",
         
     | 
| 
       41 
41 
     | 
    
         
             
             "rustc-hash",
         
     | 
| 
       42 
42 
     | 
    
         
             
             "shlex",
         
     | 
| 
       43 
     | 
    
         
            -
             "syn  
     | 
| 
      
 43 
     | 
    
         
            +
             "syn 2.0.38",
         
     | 
| 
       44 
44 
     | 
    
         
             
            ]
         
     | 
| 
       45 
45 
     | 
    
         | 
| 
       46 
46 
     | 
    
         
             
            [[package]]
         
     | 
| 
         @@ -49,6 +49,12 @@ version = "1.3.2" 
     | 
|
| 
       49 
49 
     | 
    
         
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         
     | 
| 
       50 
50 
     | 
    
         
             
            checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
         
     | 
| 
       51 
51 
     | 
    
         | 
| 
      
 52 
     | 
    
         
            +
            [[package]]
         
     | 
| 
      
 53 
     | 
    
         
            +
            name = "bitflags"
         
     | 
| 
      
 54 
     | 
    
         
            +
            version = "2.4.1"
         
     | 
| 
      
 55 
     | 
    
         
            +
            source = "registry+https://github.com/rust-lang/crates.io-index"
         
     | 
| 
      
 56 
     | 
    
         
            +
            checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
         
     | 
| 
      
 57 
     | 
    
         
            +
             
     | 
| 
       52 
58 
     | 
    
         
             
            [[package]]
         
     | 
| 
       53 
59 
     | 
    
         
             
            name = "cc"
         
     | 
| 
       54 
60 
     | 
    
         
             
            version = "1.0.79"
         
     | 
| 
         @@ -335,9 +341,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568" 
     | 
|
| 
       335 
341 
     | 
    
         | 
| 
       336 
342 
     | 
    
         
             
            [[package]]
         
     | 
| 
       337 
343 
     | 
    
         
             
            name = "magnus"
         
     | 
| 
       338 
     | 
    
         
            -
            version = "0.6. 
     | 
| 
      
 344 
     | 
    
         
            +
            version = "0.6.2"
         
     | 
| 
       339 
345 
     | 
    
         
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         
     | 
| 
       340 
     | 
    
         
            -
            checksum = " 
     | 
| 
      
 346 
     | 
    
         
            +
            checksum = "4778544796676e8428e9c622460ebf284bea52d8b10db3aeb449d8b5e61b3a13"
         
     | 
| 
       341 
347 
     | 
    
         
             
            dependencies = [
         
     | 
| 
       342 
348 
     | 
    
         
             
             "magnus-macros",
         
     | 
| 
       343 
349 
     | 
    
         
             
             "rb-sys",
         
     | 
| 
         @@ -426,7 +432,7 @@ version = "6.4.0" 
     | 
|
| 
       426 
432 
     | 
    
         
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         
     | 
| 
       427 
433 
     | 
    
         
             
            checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f"
         
     | 
| 
       428 
434 
     | 
    
         
             
            dependencies = [
         
     | 
| 
       429 
     | 
    
         
            -
             "bitflags",
         
     | 
| 
      
 435 
     | 
    
         
            +
             "bitflags 1.3.2",
         
     | 
| 
       430 
436 
     | 
    
         
             
             "libc",
         
     | 
| 
       431 
437 
     | 
    
         
             
             "once_cell",
         
     | 
| 
       432 
438 
     | 
    
         
             
             "onig_sys",
         
     | 
| 
         @@ -553,18 +559,18 @@ dependencies = [ 
     | 
|
| 
       553 
559 
     | 
    
         | 
| 
       554 
560 
     | 
    
         
             
            [[package]]
         
     | 
| 
       555 
561 
     | 
    
         
             
            name = "rb-sys"
         
     | 
| 
       556 
     | 
    
         
            -
            version = "0.9. 
     | 
| 
      
 562 
     | 
    
         
            +
            version = "0.9.86"
         
     | 
| 
       557 
563 
     | 
    
         
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         
     | 
| 
       558 
     | 
    
         
            -
            checksum = " 
     | 
| 
      
 564 
     | 
    
         
            +
            checksum = "7285f2a7b92f58ab198e3fd59a71d2861478f9c4642f41e83582385818941697"
         
     | 
| 
       559 
565 
     | 
    
         
             
            dependencies = [
         
     | 
| 
       560 
566 
     | 
    
         
             
             "rb-sys-build",
         
     | 
| 
       561 
567 
     | 
    
         
             
            ]
         
     | 
| 
       562 
568 
     | 
    
         | 
| 
       563 
569 
     | 
    
         
             
            [[package]]
         
     | 
| 
       564 
570 
     | 
    
         
             
            name = "rb-sys-build"
         
     | 
| 
       565 
     | 
    
         
            -
            version = "0.9. 
     | 
| 
      
 571 
     | 
    
         
            +
            version = "0.9.86"
         
     | 
| 
       566 
572 
     | 
    
         
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         
     | 
| 
       567 
     | 
    
         
            -
            checksum = " 
     | 
| 
      
 573 
     | 
    
         
            +
            checksum = "71583945f94dabb6c0dfa63f1b71e929c1901e1e288ef3739ab8bed3b7069550"
         
     | 
| 
       568 
574 
     | 
    
         
             
            dependencies = [
         
     | 
| 
       569 
575 
     | 
    
         
             
             "bindgen",
         
     | 
| 
       570 
576 
     | 
    
         
             
             "lazy_static",
         
     | 
| 
         @@ -572,7 +578,7 @@ dependencies = [ 
     | 
|
| 
       572 
578 
     | 
    
         
             
             "quote",
         
     | 
| 
       573 
579 
     | 
    
         
             
             "regex",
         
     | 
| 
       574 
580 
     | 
    
         
             
             "shell-words",
         
     | 
| 
       575 
     | 
    
         
            -
             "syn  
     | 
| 
      
 581 
     | 
    
         
            +
             "syn 2.0.38",
         
     | 
| 
       576 
582 
     | 
    
         
             
            ]
         
     | 
| 
       577 
583 
     | 
    
         | 
| 
       578 
584 
     | 
    
         
             
            [[package]]
         
     | 
| 
         @@ -745,7 +751,7 @@ dependencies = [ 
     | 
|
| 
       745 
751 
     | 
    
         | 
| 
       746 
752 
     | 
    
         
             
            [[package]]
         
     | 
| 
       747 
753 
     | 
    
         
             
            name = "tokenizers"
         
     | 
| 
       748 
     | 
    
         
            -
            version = "0.4.2"
     | 
| 
      
 754 
     | 
    
         
            +
            version = "0.4.3"
         
     | 
| 
       749 
755 
     | 
    
         
             
            dependencies = [
         
     | 
| 
       750 
756 
     | 
    
         
             
             "magnus",
         
     | 
| 
       751 
757 
     | 
    
         
             
             "onig",
         
     | 
    
        data/README.md
    CHANGED
    
    | 
         @@ -34,15 +34,51 @@ Decode 
     | 
|
| 
       34 
34 
     | 
    
         
             
            tokenizer.decode(ids)
         
     | 
| 
       35 
35 
     | 
    
         
             
            ```
         
     | 
| 
       36 
36 
     | 
    
         | 
| 
       37 
     | 
    
         
            -
             
     | 
| 
      
 37 
     | 
    
         
            +
            ## Training
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
            Create a tokenizer
         
     | 
| 
       38 
40 
     | 
    
         | 
| 
       39 
41 
     | 
    
         
             
            ```ruby
         
     | 
| 
       40 
     | 
    
         
            -
            tokenizer = Tokenizers:: 
     | 
| 
      
 42 
     | 
    
         
            +
            tokenizer = Tokenizers::Tokenizer.new(Tokenizers::Models::BPE.new(unk_token: "[UNK]"))
         
     | 
| 
       41 
43 
     | 
    
         
             
            ```
         
     | 
| 
       42 
44 
     | 
    
         | 
| 
       43 
     | 
    
         
            -
             
     | 
| 
      
 45 
     | 
    
         
            +
            Set the pre-tokenizer
         
     | 
| 
      
 46 
     | 
    
         
            +
             
     | 
| 
      
 47 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 48 
     | 
    
         
            +
            tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::Whitespace.new
         
     | 
| 
      
 49 
     | 
    
         
            +
            ```
         
     | 
| 
      
 50 
     | 
    
         
            +
             
     | 
| 
      
 51 
     | 
    
         
            +
            Train the tokenizer ([example data](https://huggingface.co/docs/tokenizers/quicktour#build-a-tokenizer-from-scratch))
         
     | 
| 
      
 52 
     | 
    
         
            +
             
     | 
| 
      
 53 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 54 
     | 
    
         
            +
            trainer = Tokenizers::Trainers::BpeTrainer.new(special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
         
     | 
| 
      
 55 
     | 
    
         
            +
            tokenizer.train(["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"], trainer)
         
     | 
| 
      
 56 
     | 
    
         
            +
            ```
         
     | 
| 
      
 57 
     | 
    
         
            +
             
     | 
| 
      
 58 
     | 
    
         
            +
            Encode
         
     | 
| 
      
 59 
     | 
    
         
            +
             
     | 
| 
      
 60 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 61 
     | 
    
         
            +
            output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
         
     | 
| 
      
 62 
     | 
    
         
            +
            output.tokens
         
     | 
| 
      
 63 
     | 
    
         
            +
            ```
         
     | 
| 
      
 64 
     | 
    
         
            +
             
     | 
| 
      
 65 
     | 
    
         
            +
            Save the tokenizer to a file
         
     | 
| 
      
 66 
     | 
    
         
            +
             
     | 
| 
      
 67 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 68 
     | 
    
         
            +
            tokenizer.save("tokenizer.json")
         
     | 
| 
      
 69 
     | 
    
         
            +
            ```
         
     | 
| 
      
 70 
     | 
    
         
            +
             
     | 
| 
      
 71 
     | 
    
         
            +
            Load a tokenizer from a file
         
     | 
| 
      
 72 
     | 
    
         
            +
             
     | 
| 
      
 73 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 74 
     | 
    
         
            +
            tokenizer = Tokenizers.from_file("tokenizer.json")
         
     | 
| 
      
 75 
     | 
    
         
            +
            ```
         
     | 
| 
      
 76 
     | 
    
         
            +
             
     | 
| 
      
 77 
     | 
    
         
            +
            Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8) for more info
         
     | 
| 
      
 78 
     | 
    
         
            +
             
     | 
| 
      
 79 
     | 
    
         
            +
            ## API
         
     | 
| 
       44 
80 
     | 
    
         | 
| 
       45 
     | 
    
         
            -
             
     | 
| 
      
 81 
     | 
    
         
            +
            This library follows the [Tokenizers Python API](https://huggingface.co/docs/tokenizers/index). You can follow Python tutorials and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
         
     | 
| 
       46 
82 
     | 
    
         | 
| 
       47 
83 
     | 
    
         
             
            ## History
         
     | 
| 
       48 
84 
     | 
    
         | 
    
        data/ext/tokenizers/Cargo.toml
    CHANGED
    
    
    
        data/lib/tokenizers/version.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: tokenizers
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.4.2
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.4.3
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Andrew Kane
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire:
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date:  
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2024-01-04 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies:
         
     | 
| 
       13 
13 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       14 
14 
     | 
    
         
             
              name: rb_sys
         
     | 
| 
         @@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement 
     | 
|
| 
       100 
100 
     | 
    
         
             
                - !ruby/object:Gem::Version
         
     | 
| 
       101 
101 
     | 
    
         
             
                  version: '0'
         
     | 
| 
       102 
102 
     | 
    
         
             
            requirements: []
         
     | 
| 
       103 
     | 
    
         
            -
            rubygems_version: 3. 
     | 
| 
      
 103 
     | 
    
         
            +
            rubygems_version: 3.5.3
         
     | 
| 
       104 
104 
     | 
    
         
             
            signing_key:
         
     | 
| 
       105 
105 
     | 
    
         
             
            specification_version: 4
         
     | 
| 
       106 
106 
     | 
    
         
             
            summary: Fast state-of-the-art tokenizers for Ruby
         
     |