tiktoken_ruby 0.0.3-x64-mingw-ucrt → 0.0.4-x64-mingw-ucrt
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/Gemfile +0 -1
 - data/Gemfile.lock +2 -7
 - data/README.md +12 -1
 - data/Rakefile +0 -1
 - data/doctest_helper.rb +1 -1
 - data/lib/tiktoken_ruby/3.1/tiktoken_ruby.so +0 -0
 - data/lib/tiktoken_ruby/3.2/tiktoken_ruby.so +0 -0
 - data/lib/tiktoken_ruby/encoding.rb +48 -47
 - data/lib/tiktoken_ruby/version.rb +1 -1
 - data/lib/tiktoken_ruby.rb +8 -9
 - metadata +2 -2
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 1d3ae645bd74a9706f834cfddab56cfba8233233243f0f9f7a6e38f453ea0d7f
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: c9aba2e4ffecf42dda1ec11b40e22c929dc471cf0fdffcbaa243e477b7aa18dd
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 9163832bedcebe8d97b2319862bc7230ea78441b0883655392671e9269360dd74a65234b79a2277876a786e2473b0be3feadea6b865283217ed2a75b019de2f2
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: ece1e1a541aa4b63818096b837ae1ae2e0f4fef91e1ff6baebfd2471f10fb746d6830d47c170d7fd524adede6d7411abe78bfa650ef4e27997c5064ef871373a
         
     | 
    
        data/Gemfile
    CHANGED
    
    
    
        data/Gemfile.lock
    CHANGED
    
    | 
         @@ -1,24 +1,19 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            PATH
         
     | 
| 
       2 
2 
     | 
    
         
             
              remote: .
         
     | 
| 
       3 
3 
     | 
    
         
             
              specs:
         
     | 
| 
       4 
     | 
    
         
            -
                tiktoken_ruby (0.0.3)
     | 
| 
      
 4 
     | 
    
         
            +
                tiktoken_ruby (0.0.4)
         
     | 
| 
       5 
5 
     | 
    
         | 
| 
       6 
6 
     | 
    
         
             
            GEM
         
     | 
| 
       7 
7 
     | 
    
         
             
              remote: https://rubygems.org/
         
     | 
| 
       8 
8 
     | 
    
         
             
              specs:
         
     | 
| 
       9 
9 
     | 
    
         
             
                ast (2.4.2)
         
     | 
| 
       10 
     | 
    
         
            -
                coderay (1.1.3)
         
     | 
| 
       11 
10 
     | 
    
         
             
                diff-lcs (1.5.0)
         
     | 
| 
       12 
11 
     | 
    
         
             
                json (2.6.3)
         
     | 
| 
       13 
12 
     | 
    
         
             
                language_server-protocol (3.17.0.3)
         
     | 
| 
       14 
     | 
    
         
            -
                method_source (1.0.0)
         
     | 
| 
       15 
13 
     | 
    
         
             
                minitest (5.18.0)
         
     | 
| 
       16 
14 
     | 
    
         
             
                parallel (1.22.1)
         
     | 
| 
       17 
15 
     | 
    
         
             
                parser (3.2.1.1)
         
     | 
| 
       18 
16 
     | 
    
         
             
                  ast (~> 2.4.1)
         
     | 
| 
       19 
     | 
    
         
            -
                pry (0.14.2)
         
     | 
| 
       20 
     | 
    
         
            -
                  coderay (~> 1.1)
         
     | 
| 
       21 
     | 
    
         
            -
                  method_source (~> 1.0)
         
     | 
| 
       22 
17 
     | 
    
         
             
                rainbow (3.1.1)
         
     | 
| 
       23 
18 
     | 
    
         
             
                rake (13.0.6)
         
     | 
| 
       24 
19 
     | 
    
         
             
                rake-compiler (1.2.1)
         
     | 
| 
         @@ -69,10 +64,10 @@ GEM 
     | 
|
| 
       69 
64 
     | 
    
         | 
| 
       70 
65 
     | 
    
         
             
            PLATFORMS
         
     | 
| 
       71 
66 
     | 
    
         
             
              arm64-darwin-22
         
     | 
| 
      
 67 
     | 
    
         
            +
              x86_64-darwin-22
         
     | 
| 
       72 
68 
     | 
    
         
             
              x86_64-linux
         
     | 
| 
       73 
69 
     | 
    
         | 
| 
       74 
70 
     | 
    
         
             
            DEPENDENCIES
         
     | 
| 
       75 
     | 
    
         
            -
              pry (~> 0.14.2)
         
     | 
| 
       76 
71 
     | 
    
         
             
              rake (~> 13.0)
         
     | 
| 
       77 
72 
     | 
    
         
             
              rake-compiler
         
     | 
| 
       78 
73 
     | 
    
         
             
              rb_sys
         
     | 
    
        data/README.md
    CHANGED
    
    | 
         @@ -15,7 +15,7 @@ If bundler is not being used to manage dependencies, install the gem by executin 
     | 
|
| 
       15 
15 
     | 
    
         
             
                $ gem install tiktoken_ruby
         
     | 
| 
       16 
16 
     | 
    
         | 
| 
       17 
17 
     | 
    
         
             
            ## Usage
         
     | 
| 
       18 
     | 
    
         
            -
            Usage should be very similar to the python library.  
     | 
| 
      
 18 
     | 
    
         
            +
            Usage should be very similar to the python library. Here's a simple example
         
     | 
| 
       19 
19 
     | 
    
         | 
| 
       20 
20 
     | 
    
         
             
            Encode and decode text
         
     | 
| 
       21 
21 
     | 
    
         
             
            ```ruby
         
     | 
| 
         @@ -43,6 +43,17 @@ To install this gem onto your local machine, run `bundle exec rake install`. To 
     | 
|
| 
       43 
43 
     | 
    
         | 
| 
       44 
44 
     | 
    
         
             
            Bug reports and pull requests are welcome on GitHub at https://github.com/iapark/tiktoken_ruby.
         
     | 
| 
       45 
45 
     | 
    
         | 
| 
      
 46 
     | 
    
         
            +
            To get started with development:
         
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
      
 48 
     | 
    
         
            +
            ```sh
         
     | 
| 
      
 49 
     | 
    
         
            +
            git clone https://github.com/IAPark/tiktoken_ruby.git
         
     | 
| 
      
 50 
     | 
    
         
            +
            cd tiktoken_ruby
         
     | 
| 
      
 51 
     | 
    
         
            +
            bundle install
         
     | 
| 
      
 52 
     | 
    
         
            +
            bundle exec rake compile
         
     | 
| 
      
 53 
     | 
    
         
            +
            bundle exec rake spec
         
     | 
| 
      
 54 
     | 
    
         
            +
            ```
         
     | 
| 
      
 55 
     | 
    
         
            +
             
     | 
| 
      
 56 
     | 
    
         
            +
             
     | 
| 
       46 
57 
     | 
    
         
             
            ## License
         
     | 
| 
       47 
58 
     | 
    
         | 
| 
       48 
59 
     | 
    
         
             
            The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
         
     | 
    
        data/Rakefile
    CHANGED
    
    
    
        data/doctest_helper.rb
    CHANGED
    
    | 
         @@ -1 +1 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            require  
     | 
| 
      
 1 
     | 
    
         
            +
            require "lib/tiktoken_ruby"
         
     | 
| 
         Binary file 
     | 
| 
         Binary file 
     | 
| 
         @@ -1,51 +1,52 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            # frozen_string_literal: true
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            class Tiktoken::Encoding
         
     | 
| 
       4 
     | 
    
         
            -
             
     | 
| 
       5 
     | 
    
         
            -
             
     | 
| 
       6 
     | 
    
         
            -
             
     | 
| 
       7 
     | 
    
         
            -
             
     | 
| 
       8 
     | 
    
         
            -
             
     | 
| 
       9 
     | 
    
         
            -
             
     | 
| 
       10 
     | 
    
         
            -
             
     | 
| 
       11 
     | 
    
         
            -
             
     | 
| 
       12 
     | 
    
         
            -
             
     | 
| 
       13 
     | 
    
         
            -
             
     | 
| 
       14 
     | 
    
         
            -
             
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
       16 
     | 
    
         
            -
             
     | 
| 
       17 
     | 
    
         
            -
             
     | 
| 
       18 
     | 
    
         
            -
             
     | 
| 
       19 
     | 
    
         
            -
             
     | 
| 
       20 
     | 
    
         
            -
             
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
       22 
     | 
    
         
            -
             
     | 
| 
       23 
     | 
    
         
            -
             
     | 
| 
       24 
     | 
    
         
            -
             
     | 
| 
       25 
     | 
    
         
            -
             
     | 
| 
       26 
     | 
    
         
            -
             
     | 
| 
       27 
     | 
    
         
            -
             
     | 
| 
       28 
     | 
    
         
            -
             
     | 
| 
       29 
     | 
    
         
            -
             
     | 
| 
       30 
     | 
    
         
            -
             
     | 
| 
       31 
     | 
    
         
            -
             
     | 
| 
       32 
     | 
    
         
            -
             
     | 
| 
       33 
     | 
    
         
            -
             
     | 
| 
       34 
     | 
    
         
            -
             
     | 
| 
       35 
     | 
    
         
            -
             
     | 
| 
       36 
     | 
    
         
            -
             
     | 
| 
       37 
     | 
    
         
            -
             
     | 
| 
       38 
     | 
    
         
            -
             
     | 
| 
       39 
     | 
    
         
            -
             
     | 
| 
       40 
     | 
    
         
            -
             
     | 
| 
       41 
     | 
    
         
            -
             
     | 
| 
       42 
     | 
    
         
            -
             
     | 
| 
       43 
     | 
    
         
            -
             
     | 
| 
       44 
     | 
    
         
            -
             
     | 
| 
       45 
     | 
    
         
            -
             
     | 
| 
       46 
     | 
    
         
            -
             
     | 
| 
       47 
     | 
    
         
            -
             
     | 
| 
       48 
     | 
    
         
            -
             
     | 
| 
       49 
     | 
    
         
            -
             
     | 
| 
       50 
     | 
    
         
            -
                 
     | 
| 
      
 4 
     | 
    
         
            +
              attr_reader :name
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
              # This returns a new Tiktoken::Encoding instance for the requested encoding
         
     | 
| 
      
 7 
     | 
    
         
            +
              # @param encoding [Symbol] The name of the encoding to load
         
     | 
| 
      
 8 
     | 
    
         
            +
              # @return [Tiktoken::Encoding] The encoding instance
         
     | 
| 
      
 9 
     | 
    
         
            +
              def self.for_name(encoding)
         
     | 
| 
      
 10 
     | 
    
         
            +
                Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
         
     | 
| 
      
 11 
     | 
    
         
            +
              end
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
              # This returns a Tiktoken::Encoding instance for the requested encoding
         
     | 
| 
      
 14 
     | 
    
         
            +
              # It will reuse an existing encoding if it's already been loaded
         
     | 
| 
      
 15 
     | 
    
         
            +
              # @param encoding [Symbol] The name of the encoding to load
         
     | 
| 
      
 16 
     | 
    
         
            +
              # @return [Tiktoken::Encoding] The encoding instance
         
     | 
| 
      
 17 
     | 
    
         
            +
              def self.for_name_cached(encoding)
         
     | 
| 
      
 18 
     | 
    
         
            +
                @encodings ||= {}
         
     | 
| 
      
 19 
     | 
    
         
            +
                @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
         
     | 
| 
      
 20 
     | 
    
         
            +
              end
         
     | 
| 
      
 21 
     | 
    
         
            +
             
     | 
| 
      
 22 
     | 
    
         
            +
              # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
         
     | 
| 
      
 23 
     | 
    
         
            +
              # basically it's unescaped
         
     | 
| 
      
 24 
     | 
    
         
            +
              # @param text [String] The text to encode
         
     | 
| 
      
 25 
     | 
    
         
            +
              # @return [Array<Integer>] The encoded tokens
         
     | 
| 
      
 26 
     | 
    
         
            +
              def encode_ordinary(text)
         
     | 
| 
      
 27 
     | 
    
         
            +
                @ext_base_bpe.encode_ordinary(text)
         
     | 
| 
      
 28 
     | 
    
         
            +
              end
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
              # Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
         
     | 
| 
      
 31 
     | 
    
         
            +
              # as text unless they're in the allowed_special array. It's basically like the text was escaped
         
     | 
| 
      
 32 
     | 
    
         
            +
              # @param text [String] The text to encode
         
     | 
| 
      
 33 
     | 
    
         
            +
              # @param allowed_special [Array<String>] An array of special tokens to allow
         
     | 
| 
      
 34 
     | 
    
         
            +
              # @return [Array<Integer>] The encoded tokens
         
     | 
| 
      
 35 
     | 
    
         
            +
              def encode(text, allowed_special: [])
         
     | 
| 
      
 36 
     | 
    
         
            +
                @ext_base_bpe.encode(text, allowed_special)
         
     | 
| 
      
 37 
     | 
    
         
            +
              end
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
              # Decodes the tokens back into text
         
     | 
| 
      
 40 
     | 
    
         
            +
              # @param tokens [Array<Integer>] The tokens to decode
         
     | 
| 
      
 41 
     | 
    
         
            +
              # @return [String] The decoded text
         
     | 
| 
      
 42 
     | 
    
         
            +
              def decode(tokens)
         
     | 
| 
      
 43 
     | 
    
         
            +
                @ext_base_bpe.decode(tokens)
         
     | 
| 
      
 44 
     | 
    
         
            +
              end
         
     | 
| 
      
 45 
     | 
    
         
            +
             
     | 
| 
      
 46 
     | 
    
         
            +
              private
         
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
      
 48 
     | 
    
         
            +
              def initialize(ext_base_bpe, name)
         
     | 
| 
      
 49 
     | 
    
         
            +
                @ext_base_bpe = ext_base_bpe
         
     | 
| 
      
 50 
     | 
    
         
            +
                @name = name
         
     | 
| 
      
 51 
     | 
    
         
            +
              end
         
     | 
| 
       51 
52 
     | 
    
         
             
            end
         
     | 
    
        data/lib/tiktoken_ruby.rb
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            # frozen_string_literal: true
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            require_relative "tiktoken_ruby/version"
         
     | 
| 
       4 
     | 
    
         
            -
            require_relative "tiktoken_ruby/encoding 
     | 
| 
      
 4 
     | 
    
         
            +
            require_relative "tiktoken_ruby/encoding"
         
     | 
| 
       5 
5 
     | 
    
         | 
| 
       6 
6 
     | 
    
         
             
            begin
         
     | 
| 
       7 
7 
     | 
    
         
             
              RUBY_VERSION =~ /(\d+\.\d+)/
         
     | 
| 
         @@ -12,7 +12,6 @@ end 
     | 
|
| 
       12 
12 
     | 
    
         | 
| 
       13 
13 
     | 
    
         
             
            module Tiktoken
         
     | 
| 
       14 
14 
     | 
    
         
             
              class << self
         
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
       16 
15 
     | 
    
         
             
                # Returns an encoding by name. If the encoding is not already loaded it will be loaded, but otherwise
         
     | 
| 
       17 
16 
     | 
    
         
             
                # it will reuse the instance of that type that was previously loaded
         
     | 
| 
       18 
17 
     | 
    
         
             
                # @param name [Symbol|String] The name of the encoding to load
         
     | 
| 
         @@ -34,7 +33,7 @@ module Tiktoken 
     | 
|
| 
       34 
33 
     | 
    
         
             
                #   enc = Tiktoken.encoding_for_model("gpt-4")
         
     | 
| 
       35 
34 
     | 
    
         
             
                #   enc.encode("hello world").length #=> 2
         
     | 
| 
       36 
35 
     | 
    
         
             
                def encoding_for_model(model_name)
         
     | 
| 
       37 
     | 
    
         
            -
                   
     | 
| 
      
 36 
     | 
    
         
            +
                  PREFIX_MODELS.each do |prefix|
         
     | 
| 
       38 
37 
     | 
    
         
             
                    if model_name.to_s.start_with?("#{prefix}-")
         
     | 
| 
       39 
38 
     | 
    
         
             
                      model_name = prefix
         
     | 
| 
       40 
39 
     | 
    
         
             
                      break
         
     | 
| 
         @@ -65,7 +64,7 @@ module Tiktoken 
     | 
|
| 
       65 
64 
     | 
    
         
             
                  :r50k_base,
         
     | 
| 
       66 
65 
     | 
    
         
             
                  :p50k_base,
         
     | 
| 
       67 
66 
     | 
    
         
             
                  :p50k_edit,
         
     | 
| 
       68 
     | 
    
         
            -
                  :cl100k_base 
     | 
| 
      
 67 
     | 
    
         
            +
                  :cl100k_base
         
     | 
| 
       69 
68 
     | 
    
         
             
                ]
         
     | 
| 
       70 
69 
     | 
    
         | 
| 
       71 
70 
     | 
    
         
             
                # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
         
     | 
| 
         @@ -80,10 +79,10 @@ module Tiktoken 
     | 
|
| 
       80 
79 
     | 
    
         
             
                  "text-curie-001": "r50k_base",
         
     | 
| 
       81 
80 
     | 
    
         
             
                  "text-babbage-001": "r50k_base",
         
     | 
| 
       82 
81 
     | 
    
         
             
                  "text-ada-001": "r50k_base",
         
     | 
| 
       83 
     | 
    
         
            -
                   
     | 
| 
       84 
     | 
    
         
            -
                   
     | 
| 
       85 
     | 
    
         
            -
                   
     | 
| 
       86 
     | 
    
         
            -
                   
     | 
| 
      
 82 
     | 
    
         
            +
                  davinci: "r50k_base",
         
     | 
| 
      
 83 
     | 
    
         
            +
                  curie: "r50k_base",
         
     | 
| 
      
 84 
     | 
    
         
            +
                  babbage: "r50k_base",
         
     | 
| 
      
 85 
     | 
    
         
            +
                  ada: "r50k_base",
         
     | 
| 
       87 
86 
     | 
    
         
             
                  # code
         
     | 
| 
       88 
87 
     | 
    
         
             
                  "code-davinci-002": "p50k_base",
         
     | 
| 
       89 
88 
     | 
    
         
             
                  "code-davinci-001": "p50k_base",
         
     | 
| 
         @@ -106,7 +105,7 @@ module Tiktoken 
     | 
|
| 
       106 
105 
     | 
    
         
             
                  "text-search-babbage-doc-001": "r50k_base",
         
     | 
| 
       107 
106 
     | 
    
         
             
                  "text-search-ada-doc-001": "r50k_base",
         
     | 
| 
       108 
107 
     | 
    
         
             
                  "code-search-babbage-code-001": "r50k_base",
         
     | 
| 
       109 
     | 
    
         
            -
                  "code-search-ada-code-001": "r50k_base" 
     | 
| 
      
 108 
     | 
    
         
            +
                  "code-search-ada-code-001": "r50k_base"
         
     | 
| 
       110 
109 
     | 
    
         
             
                }
         
     | 
| 
       111 
110 
     | 
    
         | 
| 
       112 
111 
     | 
    
         
             
                # these are models that have versioned variants that are otherwise identical
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: tiktoken_ruby
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.0.3
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.0.4
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: x64-mingw-ucrt
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - IAPark
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire: 
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: exe
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date: 2023-03- 
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2023-03-28 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies: []
         
     | 
| 
       13 
13 
     | 
    
         
             
            description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
         
     | 
| 
       14 
14 
     | 
    
         
             
              used by OpenAI. It can be used to count the number of tokens in text before sending
         
     |