tiktoken_ruby 0.0.2-x86_64-linux → 0.0.4-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/Gemfile +2 -1
 - data/Gemfile.lock +10 -7
 - data/README.md +25 -3
 - data/Rakefile +0 -1
 - data/doctest_helper.rb +1 -0
 - data/lib/tiktoken_ruby/2.7/tiktoken_ruby.so +0 -0
 - data/lib/tiktoken_ruby/3.0/tiktoken_ruby.so +0 -0
 - data/lib/tiktoken_ruby/3.1/tiktoken_ruby.so +0 -0
 - data/lib/tiktoken_ruby/3.2/tiktoken_ruby.so +0 -0
 - data/lib/tiktoken_ruby/encoding.rb +49 -20
 - data/lib/tiktoken_ruby/version.rb +1 -1
 - data/lib/tiktoken_ruby.rb +101 -2
 - metadata +7 -3
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 3094946d0ce4e358c99e6de95455b132135a7d286e6bbe24fcf21f1e5ceea867
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: a66c81531ddf86d99c0b9f2bd993c0dfe5a00538d034c2c0648d84a1526f0d14
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: e1196d2e5bdff73ced49a4b7d9c8c88621cb93c8e598ab37ba2271a2e4d7e4fccae209fd8a04beb19c9f5c4306815b419ca2b41da31974f65809d8b02c38b2b5
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: 24a4ca1948419483d29b1c22b704d4cd3133e0398acba412efc3f1b03127d4c6bdf1a4fae2f22bf59aa2d6175d36825ae213909380c848d790cc894a6aaeccb9
         
     | 
    
        data/Gemfile
    CHANGED
    
    
    
        data/Gemfile.lock
    CHANGED
    
    | 
         @@ -1,23 +1,19 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            PATH
         
     | 
| 
       2 
2 
     | 
    
         
             
              remote: .
         
     | 
| 
       3 
3 
     | 
    
         
             
              specs:
         
     | 
| 
       4 
     | 
    
         
            -
                tiktoken_ruby (0.0. 
     | 
| 
      
 4 
     | 
    
         
            +
                tiktoken_ruby (0.0.4)
         
     | 
| 
       5 
5 
     | 
    
         | 
| 
       6 
6 
     | 
    
         
             
            GEM
         
     | 
| 
       7 
7 
     | 
    
         
             
              remote: https://rubygems.org/
         
     | 
| 
       8 
8 
     | 
    
         
             
              specs:
         
     | 
| 
       9 
9 
     | 
    
         
             
                ast (2.4.2)
         
     | 
| 
       10 
     | 
    
         
            -
                coderay (1.1.3)
         
     | 
| 
       11 
10 
     | 
    
         
             
                diff-lcs (1.5.0)
         
     | 
| 
       12 
11 
     | 
    
         
             
                json (2.6.3)
         
     | 
| 
       13 
12 
     | 
    
         
             
                language_server-protocol (3.17.0.3)
         
     | 
| 
       14 
     | 
    
         
            -
                 
     | 
| 
      
 13 
     | 
    
         
            +
                minitest (5.18.0)
         
     | 
| 
       15 
14 
     | 
    
         
             
                parallel (1.22.1)
         
     | 
| 
       16 
15 
     | 
    
         
             
                parser (3.2.1.1)
         
     | 
| 
       17 
16 
     | 
    
         
             
                  ast (~> 2.4.1)
         
     | 
| 
       18 
     | 
    
         
            -
                pry (0.14.2)
         
     | 
| 
       19 
     | 
    
         
            -
                  coderay (~> 1.1)
         
     | 
| 
       20 
     | 
    
         
            -
                  method_source (~> 1.0)
         
     | 
| 
       21 
17 
     | 
    
         
             
                rainbow (3.1.1)
         
     | 
| 
       22 
18 
     | 
    
         
             
                rake (13.0.6)
         
     | 
| 
       23 
19 
     | 
    
         
             
                rake-compiler (1.2.1)
         
     | 
| 
         @@ -59,19 +55,26 @@ GEM 
     | 
|
| 
       59 
55 
     | 
    
         
             
                  rubocop (= 1.48.1)
         
     | 
| 
       60 
56 
     | 
    
         
             
                  rubocop-performance (= 1.16.0)
         
     | 
| 
       61 
57 
     | 
    
         
             
                unicode-display_width (2.4.2)
         
     | 
| 
      
 58 
     | 
    
         
            +
                webrick (1.7.0)
         
     | 
| 
      
 59 
     | 
    
         
            +
                yard (0.9.28)
         
     | 
| 
      
 60 
     | 
    
         
            +
                  webrick (~> 1.7.0)
         
     | 
| 
      
 61 
     | 
    
         
            +
                yard-doctest (0.1.17)
         
     | 
| 
      
 62 
     | 
    
         
            +
                  minitest
         
     | 
| 
      
 63 
     | 
    
         
            +
                  yard
         
     | 
| 
       62 
64 
     | 
    
         | 
| 
       63 
65 
     | 
    
         
             
            PLATFORMS
         
     | 
| 
       64 
66 
     | 
    
         
             
              arm64-darwin-22
         
     | 
| 
      
 67 
     | 
    
         
            +
              x86_64-darwin-22
         
     | 
| 
       65 
68 
     | 
    
         
             
              x86_64-linux
         
     | 
| 
       66 
69 
     | 
    
         | 
| 
       67 
70 
     | 
    
         
             
            DEPENDENCIES
         
     | 
| 
       68 
     | 
    
         
            -
              pry (~> 0.14.2)
         
     | 
| 
       69 
71 
     | 
    
         
             
              rake (~> 13.0)
         
     | 
| 
       70 
72 
     | 
    
         
             
              rake-compiler
         
     | 
| 
       71 
73 
     | 
    
         
             
              rb_sys
         
     | 
| 
       72 
74 
     | 
    
         
             
              rspec (~> 3.0)
         
     | 
| 
       73 
75 
     | 
    
         
             
              standard (~> 1.3)
         
     | 
| 
       74 
76 
     | 
    
         
             
              tiktoken_ruby!
         
     | 
| 
      
 77 
     | 
    
         
            +
              yard-doctest (~> 0.1.17)
         
     | 
| 
       75 
78 
     | 
    
         | 
| 
       76 
79 
     | 
    
         
             
            BUNDLED WITH
         
     | 
| 
       77 
80 
     | 
    
         
             
               2.4.6
         
     | 
    
        data/README.md
    CHANGED
    
    | 
         @@ -15,11 +15,22 @@ If bundler is not being used to manage dependencies, install the gem by executin 
     | 
|
| 
       15 
15 
     | 
    
         
             
                $ gem install tiktoken_ruby
         
     | 
| 
       16 
16 
     | 
    
         | 
| 
       17 
17 
     | 
    
         
             
            ## Usage
         
     | 
| 
      
 18 
     | 
    
         
            +
            Usage should be very similar to the Python library. Here's a simple example:
         
     | 
| 
       18 
19 
     | 
    
         | 
| 
      
 20 
     | 
    
         
            +
            Encode and decode text
         
     | 
| 
       19 
21 
     | 
    
         
             
            ```ruby
         
     | 
| 
       20 
     | 
    
         
            -
             
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
       22 
     | 
    
         
            -
             
     | 
| 
      
 22 
     | 
    
         
            +
            require 'tiktoken_ruby'
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
            enc = Tiktoken.get_encoding("cl100k_base")
         
     | 
| 
      
 25 
     | 
    
         
            +
            enc.decode(enc.encode("hello world")) #=> "hello world"
         
     | 
| 
      
 26 
     | 
    
         
            +
            ```
         
     | 
| 
      
 27 
     | 
    
         
            +
             
     | 
| 
      
 28 
     | 
    
         
            +
            Encoders can also be retrieved by model name
         
     | 
| 
      
 29 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 30 
     | 
    
         
            +
            require 'tiktoken_ruby'
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
            enc = Tiktoken.encoding_for_model("gpt-4")
         
     | 
| 
      
 33 
     | 
    
         
            +
            enc.encode("hello world").length #=> 2
         
     | 
| 
       23 
34 
     | 
    
         
             
            ```
         
     | 
| 
       24 
35 
     | 
    
         | 
| 
       25 
36 
     | 
    
         
             
            ## Development
         
     | 
| 
         @@ -32,6 +43,17 @@ To install this gem onto your local machine, run `bundle exec rake install`. To 
     | 
|
| 
       32 
43 
     | 
    
         | 
| 
       33 
44 
     | 
    
         
             
            Bug reports and pull requests are welcome on GitHub at https://github.com/iapark/tiktoken_ruby.
         
     | 
| 
       34 
45 
     | 
    
         | 
| 
      
 46 
     | 
    
         
            +
            To get started with development:
         
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
      
 48 
     | 
    
         
            +
            ```sh
         
     | 
| 
      
 49 
     | 
    
         
            +
            git clone https://github.com/IAPark/tiktoken_ruby.git
         
     | 
| 
      
 50 
     | 
    
         
            +
            cd tiktoken_ruby
         
     | 
| 
      
 51 
     | 
    
         
            +
            bundle install
         
     | 
| 
      
 52 
     | 
    
         
            +
            bundle exec rake compile
         
     | 
| 
      
 53 
     | 
    
         
            +
            bundle exec rake spec
         
     | 
| 
      
 54 
     | 
    
         
            +
            ```
         
     | 
| 
      
 55 
     | 
    
         
            +
             
     | 
| 
      
 56 
     | 
    
         
            +
             
     | 
| 
       35 
57 
     | 
    
         
             
            ## License
         
     | 
| 
       36 
58 
     | 
    
         | 
| 
       37 
59 
     | 
    
         
             
            The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
         
     | 
    
        data/Rakefile
    CHANGED
    
    
    
        data/doctest_helper.rb
    ADDED
    
    | 
         @@ -0,0 +1 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require "lib/tiktoken_ruby"
         
     | 
| 
         Binary file 
     | 
| 
         Binary file 
     | 
| 
         Binary file 
     | 
| 
         Binary file 
     | 
| 
         @@ -1,23 +1,52 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            # frozen_string_literal: true
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
     | 
    
         
            -
            class Tiktoken::Encoding 
     | 
| 
       4 
     | 
    
         
            -
             
     | 
| 
       5 
     | 
    
         
            -
             
     | 
| 
       6 
     | 
    
         
            -
             
     | 
| 
       7 
     | 
    
         
            -
             
     | 
| 
       8 
     | 
    
         
            -
             
     | 
| 
       9 
     | 
    
         
            -
             
     | 
| 
       10 
     | 
    
         
            -
                 
     | 
| 
       11 
     | 
    
         
            -
             
     | 
| 
       12 
     | 
    
         
            -
             
     | 
| 
       13 
     | 
    
         
            -
             
     | 
| 
       14 
     | 
    
         
            -
             
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
       16 
     | 
    
         
            -
             
     | 
| 
       17 
     | 
    
         
            -
             
     | 
| 
       18 
     | 
    
         
            -
                 
     | 
| 
       19 
     | 
    
         
            -
             
     | 
| 
       20 
     | 
    
         
            -
             
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
       22 
     | 
    
         
            -
             
     | 
| 
      
 3 
     | 
    
         
            +
            class Tiktoken::Encoding
         
     | 
| 
      
 4 
     | 
    
         
            +
              attr_reader :name
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
              # This returns a new Tiktoken::Encoding instance for the requested encoding
         
     | 
| 
      
 7 
     | 
    
         
            +
              # @param encoding [Symbol] The name of the encoding to load
         
     | 
| 
      
 8 
     | 
    
         
            +
              # @return [Tiktoken::Encoding] The encoding instance
         
     | 
| 
      
 9 
     | 
    
         
            +
              def self.for_name(encoding)
         
     | 
| 
      
 10 
     | 
    
         
            +
                Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
         
     | 
| 
      
 11 
     | 
    
         
            +
              end
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
              # This returns a Tiktoken::Encoding instance for the requested encoding
         
     | 
| 
      
 14 
     | 
    
         
            +
              # It will reuse an existing encoding if it's already been loaded
         
     | 
| 
      
 15 
     | 
    
         
            +
              # @param encoding [Symbol] The name of the encoding to load
         
     | 
| 
      
 16 
     | 
    
         
            +
              # @return [Tiktoken::Encoding] The encoding instance
         
     | 
| 
      
 17 
     | 
    
         
            +
              def self.for_name_cached(encoding)
         
     | 
| 
      
 18 
     | 
    
         
            +
                @encodings ||= {}
         
     | 
| 
      
 19 
     | 
    
         
            +
                @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
         
     | 
| 
      
 20 
     | 
    
         
            +
              end
         
     | 
| 
      
 21 
     | 
    
         
            +
             
     | 
| 
      
 22 
     | 
    
         
            +
              # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
         
     | 
| 
      
 23 
     | 
    
         
            +
              # basically it's unescaped
         
     | 
| 
      
 24 
     | 
    
         
            +
              # @param text [String] The text to encode
         
     | 
| 
      
 25 
     | 
    
         
            +
              # @return [Array<Integer>] The encoded tokens
         
     | 
| 
      
 26 
     | 
    
         
            +
              def encode_ordinary(text)
         
     | 
| 
      
 27 
     | 
    
         
            +
                @ext_base_bpe.encode_ordinary(text)
         
     | 
| 
      
 28 
     | 
    
         
            +
              end
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
              # Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
         
     | 
| 
      
 31 
     | 
    
         
            +
              # as text unless they're in the allowed_special array. It's basically like the text was escaped
         
     | 
| 
      
 32 
     | 
    
         
            +
              # @param text [String] The text to encode
         
     | 
| 
      
 33 
     | 
    
         
            +
              # @param allowed_special [Array<String>] An array of special tokens to allow
         
     | 
| 
      
 34 
     | 
    
         
            +
              # @return [Array<Integer>] The encoded tokens
         
     | 
| 
      
 35 
     | 
    
         
            +
              def encode(text, allowed_special: [])
         
     | 
| 
      
 36 
     | 
    
         
            +
                @ext_base_bpe.encode(text, allowed_special)
         
     | 
| 
      
 37 
     | 
    
         
            +
              end
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
              # Decodes the tokens back into text
         
     | 
| 
      
 40 
     | 
    
         
            +
              # @param tokens [Array<Integer>] The tokens to decode
         
     | 
| 
      
 41 
     | 
    
         
            +
              # @return [String] The decoded text
         
     | 
| 
      
 42 
     | 
    
         
            +
              def decode(tokens)
         
     | 
| 
      
 43 
     | 
    
         
            +
                @ext_base_bpe.decode(tokens)
         
     | 
| 
      
 44 
     | 
    
         
            +
              end
         
     | 
| 
      
 45 
     | 
    
         
            +
             
     | 
| 
      
 46 
     | 
    
         
            +
              private
         
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
      
 48 
     | 
    
         
            +
              def initialize(ext_base_bpe, name)
         
     | 
| 
      
 49 
     | 
    
         
            +
                @ext_base_bpe = ext_base_bpe
         
     | 
| 
      
 50 
     | 
    
         
            +
                @name = name
         
     | 
| 
      
 51 
     | 
    
         
            +
              end
         
     | 
| 
       23 
52 
     | 
    
         
             
            end
         
     | 
    
        data/lib/tiktoken_ruby.rb
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            # frozen_string_literal: true
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            require_relative "tiktoken_ruby/version"
         
     | 
| 
       4 
     | 
    
         
            -
            require_relative "tiktoken_ruby/encoding 
     | 
| 
      
 4 
     | 
    
         
            +
            require_relative "tiktoken_ruby/encoding"
         
     | 
| 
       5 
5 
     | 
    
         | 
| 
       6 
6 
     | 
    
         
             
            begin
         
     | 
| 
       7 
7 
     | 
    
         
             
              RUBY_VERSION =~ /(\d+\.\d+)/
         
     | 
| 
         @@ -11,5 +11,104 @@ rescue LoadError 
     | 
|
| 
       11 
11 
     | 
    
         
             
            end
         
     | 
| 
       12 
12 
     | 
    
         | 
| 
       13 
13 
     | 
    
         
             
            module Tiktoken
         
     | 
| 
       14 
     | 
    
         
            -
              class  
     | 
| 
      
 14 
     | 
    
         
            +
              class << self
         
     | 
| 
      
 15 
     | 
    
         
            +
                # Returns an encoding by name. If the encoding is not already loaded it will be loaded, but otherwise
         
     | 
| 
      
 16 
     | 
    
         
            +
                # it will reuse the instance of that type that was previously loaded
         
     | 
| 
      
 17 
     | 
    
         
            +
                # @param name [Symbol|String] The name of the encoding to load
         
     | 
| 
      
 18 
     | 
    
         
            +
                # @return [Tiktoken::Encoding] The encoding instance
         
     | 
| 
      
 19 
     | 
    
         
            +
                # @example Encode and decode text
         
     | 
| 
      
 20 
     | 
    
         
            +
                #   enc = Tiktoken.get_encoding("cl100k_base")
         
     | 
| 
      
 21 
     | 
    
         
            +
                #   enc.decode(enc.encode("hello world")) #=> "hello world"
         
     | 
| 
      
 22 
     | 
    
         
            +
                def get_encoding(name)
         
     | 
| 
      
 23 
     | 
    
         
            +
                  name = name.to_sym
         
     | 
| 
      
 24 
     | 
    
         
            +
                  return nil unless SUPPORTED_ENCODINGS.include?(name)
         
     | 
| 
      
 25 
     | 
    
         
            +
             
     | 
| 
      
 26 
     | 
    
         
            +
                  Tiktoken::Encoding.for_name_cached(name)
         
     | 
| 
      
 27 
     | 
    
         
            +
                end
         
     | 
| 
      
 28 
     | 
    
         
            +
             
     | 
| 
      
 29 
     | 
    
         
            +
                # Gets the encoding for an OpenAI model
         
     | 
| 
      
 30 
     | 
    
         
            +
                # @param model_name [Symbol|String] The name of the model to get the encoding for
         
     | 
| 
      
 31 
     | 
    
         
            +
                # @return [Tiktoken::Encoding] The encoding instance
         
     | 
| 
      
 32 
     | 
    
         
            +
                # @example Count tokens for text
         
     | 
| 
      
 33 
     | 
    
         
            +
                #   enc = Tiktoken.encoding_for_model("gpt-4")
         
     | 
| 
      
 34 
     | 
    
         
            +
                #   enc.encode("hello world").length #=> 2
         
     | 
| 
      
 35 
     | 
    
         
            +
                def encoding_for_model(model_name)
         
     | 
| 
      
 36 
     | 
    
         
            +
                  PREFIX_MODELS.each do |prefix|
         
     | 
| 
      
 37 
     | 
    
         
            +
                    if model_name.to_s.start_with?("#{prefix}-")
         
     | 
| 
      
 38 
     | 
    
         
            +
                      model_name = prefix
         
     | 
| 
      
 39 
     | 
    
         
            +
                      break
         
     | 
| 
      
 40 
     | 
    
         
            +
                    end
         
     | 
| 
      
 41 
     | 
    
         
            +
                  end
         
     | 
| 
      
 42 
     | 
    
         
            +
             
     | 
| 
      
 43 
     | 
    
         
            +
                  encoding_name = MODEL_TO_ENCODING_NAME[model_name.to_sym]
         
     | 
| 
      
 44 
     | 
    
         
            +
                  return nil unless encoding_name
         
     | 
| 
      
 45 
     | 
    
         
            +
             
     | 
| 
      
 46 
     | 
    
         
            +
                  get_encoding(encoding_name)
         
     | 
| 
      
 47 
     | 
    
         
            +
                end
         
     | 
| 
      
 48 
     | 
    
         
            +
             
     | 
| 
      
 49 
     | 
    
         
            +
                # Lists all the encodings that are supported
         
     | 
| 
      
 50 
     | 
    
         
            +
                # @return [Array<Symbol>] The list of supported encodings
         
     | 
| 
      
 51 
     | 
    
         
            +
                def list_encoding_names
         
     | 
| 
      
 52 
     | 
    
         
            +
                  SUPPORTED_ENCODINGS
         
     | 
| 
      
 53 
     | 
    
         
            +
                end
         
     | 
| 
      
 54 
     | 
    
         
            +
             
     | 
| 
      
 55 
     | 
    
         
            +
                # Lists all the models that are supported
         
     | 
| 
      
 56 
     | 
    
         
            +
                # @return [Array<Symbol>] The list of supported models
         
     | 
| 
      
 57 
     | 
    
         
            +
                def list_model_names
         
     | 
| 
      
 58 
     | 
    
         
            +
                  MODEL_TO_ENCODING_NAME.keys
         
     | 
| 
      
 59 
     | 
    
         
            +
                end
         
     | 
| 
      
 60 
     | 
    
         
            +
             
     | 
| 
      
 61 
     | 
    
         
            +
                private
         
     | 
| 
      
 62 
     | 
    
         
            +
             
     | 
| 
      
 63 
     | 
    
         
            +
                SUPPORTED_ENCODINGS = [
         
     | 
| 
      
 64 
     | 
    
         
            +
                  :r50k_base,
         
     | 
| 
      
 65 
     | 
    
         
            +
                  :p50k_base,
         
     | 
| 
      
 66 
     | 
    
         
            +
                  :p50k_edit,
         
     | 
| 
      
 67 
     | 
    
         
            +
                  :cl100k_base
         
     | 
| 
      
 68 
     | 
    
         
            +
                ]
         
     | 
| 
      
 69 
     | 
    
         
            +
             
     | 
| 
      
 70 
     | 
    
         
            +
                # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
         
     | 
| 
      
 71 
     | 
    
         
            +
                # that is also MIT licensed but by OpenAI
         
     | 
| 
      
 72 
     | 
    
         
            +
                MODEL_TO_ENCODING_NAME = {
         
     | 
| 
      
 73 
     | 
    
         
            +
                  "gpt-4": "cl100k_base",
         
     | 
| 
      
 74 
     | 
    
         
            +
                  "gpt-3.5-turbo": "cl100k_base",
         
     | 
| 
      
 75 
     | 
    
         
            +
                  # text
         
     | 
| 
      
 76 
     | 
    
         
            +
                  "text-davinci-003": "p50k_base",
         
     | 
| 
      
 77 
     | 
    
         
            +
                  "text-davinci-002": "p50k_base",
         
     | 
| 
      
 78 
     | 
    
         
            +
                  "text-davinci-001": "r50k_base",
         
     | 
| 
      
 79 
     | 
    
         
            +
                  "text-curie-001": "r50k_base",
         
     | 
| 
      
 80 
     | 
    
         
            +
                  "text-babbage-001": "r50k_base",
         
     | 
| 
      
 81 
     | 
    
         
            +
                  "text-ada-001": "r50k_base",
         
     | 
| 
      
 82 
     | 
    
         
            +
                  davinci: "r50k_base",
         
     | 
| 
      
 83 
     | 
    
         
            +
                  curie: "r50k_base",
         
     | 
| 
      
 84 
     | 
    
         
            +
                  babbage: "r50k_base",
         
     | 
| 
      
 85 
     | 
    
         
            +
                  ada: "r50k_base",
         
     | 
| 
      
 86 
     | 
    
         
            +
                  # code
         
     | 
| 
      
 87 
     | 
    
         
            +
                  "code-davinci-002": "p50k_base",
         
     | 
| 
      
 88 
     | 
    
         
            +
                  "code-davinci-001": "p50k_base",
         
     | 
| 
      
 89 
     | 
    
         
            +
                  "code-cushman-002": "p50k_base",
         
     | 
| 
      
 90 
     | 
    
         
            +
                  "code-cushman-001": "p50k_base",
         
     | 
| 
      
 91 
     | 
    
         
            +
                  "davinci-codex": "p50k_base",
         
     | 
| 
      
 92 
     | 
    
         
            +
                  "cushman-codex": "p50k_base",
         
     | 
| 
      
 93 
     | 
    
         
            +
                  # edit
         
     | 
| 
      
 94 
     | 
    
         
            +
                  "text-davinci-edit-001": "p50k_edit",
         
     | 
| 
      
 95 
     | 
    
         
            +
                  "code-davinci-edit-001": "p50k_edit",
         
     | 
| 
      
 96 
     | 
    
         
            +
                  # embeddings
         
     | 
| 
      
 97 
     | 
    
         
            +
                  "text-embedding-ada-002": "cl100k_base",
         
     | 
| 
      
 98 
     | 
    
         
            +
                  # old embeddings
         
     | 
| 
      
 99 
     | 
    
         
            +
                  "text-similarity-davinci-001": "r50k_base",
         
     | 
| 
      
 100 
     | 
    
         
            +
                  "text-similarity-curie-001": "r50k_base",
         
     | 
| 
      
 101 
     | 
    
         
            +
                  "text-similarity-babbage-001": "r50k_base",
         
     | 
| 
      
 102 
     | 
    
         
            +
                  "text-similarity-ada-001": "r50k_base",
         
     | 
| 
      
 103 
     | 
    
         
            +
                  "text-search-davinci-doc-001": "r50k_base",
         
     | 
| 
      
 104 
     | 
    
         
            +
                  "text-search-curie-doc-001": "r50k_base",
         
     | 
| 
      
 105 
     | 
    
         
            +
                  "text-search-babbage-doc-001": "r50k_base",
         
     | 
| 
      
 106 
     | 
    
         
            +
                  "text-search-ada-doc-001": "r50k_base",
         
     | 
| 
      
 107 
     | 
    
         
            +
                  "code-search-babbage-code-001": "r50k_base",
         
     | 
| 
      
 108 
     | 
    
         
            +
                  "code-search-ada-code-001": "r50k_base"
         
     | 
| 
      
 109 
     | 
    
         
            +
                }
         
     | 
| 
      
 110 
     | 
    
         
            +
             
     | 
| 
      
 111 
     | 
    
         
            +
                # these are models that have versioned variants which are otherwise identical
         
     | 
| 
      
 112 
     | 
    
         
            +
                PREFIX_MODELS = ["gpt-4", "gpt-3.5-turbo"]
         
     | 
| 
      
 113 
     | 
    
         
            +
              end
         
     | 
| 
       15 
114 
     | 
    
         
             
            end
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,16 +1,18 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: tiktoken_ruby
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.0.2
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.0.4
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: x86_64-linux
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - IAPark
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire: 
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: exe
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date: 2023-03- 
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2023-03-28 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies: []
         
     | 
| 
       13 
     | 
    
         
            -
            description:  
     | 
| 
      
 13 
     | 
    
         
            +
            description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
         
     | 
| 
      
 14 
     | 
    
         
            +
              used by OpenAI. It can be used to count the number of tokens in text before sending
         
     | 
| 
      
 15 
     | 
    
         
            +
              it to OpenAI APIs.
         
     | 
| 
       14 
16 
     | 
    
         
             
            email:
         
     | 
| 
       15 
17 
     | 
    
         
             
            - isaac.a.park@gmail.com
         
     | 
| 
       16 
18 
     | 
    
         
             
            executables: []
         
     | 
| 
         @@ -24,6 +26,7 @@ files: 
     | 
|
| 
       24 
26 
     | 
    
         
             
            - LICENSE.txt
         
     | 
| 
       25 
27 
     | 
    
         
             
            - README.md
         
     | 
| 
       26 
28 
     | 
    
         
             
            - Rakefile
         
     | 
| 
      
 29 
     | 
    
         
            +
            - doctest_helper.rb
         
     | 
| 
       27 
30 
     | 
    
         
             
            - lib/tiktoken_ruby.rb
         
     | 
| 
       28 
31 
     | 
    
         
             
            - lib/tiktoken_ruby/2.7/tiktoken_ruby.so
         
     | 
| 
       29 
32 
     | 
    
         
             
            - lib/tiktoken_ruby/3.0/tiktoken_ruby.so
         
     | 
| 
         @@ -38,6 +41,7 @@ licenses: 
     | 
|
| 
       38 
41 
     | 
    
         
             
            metadata:
         
     | 
| 
       39 
42 
     | 
    
         
             
              homepage_uri: https://github.com/IAPark/tiktoken_ruby
         
     | 
| 
       40 
43 
     | 
    
         
             
              source_code_uri: https://github.com/IAPark/tiktoken_ruby
         
     | 
| 
      
 44 
     | 
    
         
            +
              documentation_uri: https://rubydoc.info/github/IAPark/tiktoken_ruby/main
         
     | 
| 
       41 
45 
     | 
    
         
             
            post_install_message: 
         
     | 
| 
       42 
46 
     | 
    
         
             
            rdoc_options: []
         
     | 
| 
       43 
47 
     | 
    
         
             
            require_paths:
         
     |