tiktoken_ruby 0.0.2-x64-mingw-ucrt → 0.0.3-x64-mingw-ucrt
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/Gemfile.lock +9 -1
- data/README.md +14 -3
- data/doctest_helper.rb +1 -0
- data/lib/tiktoken_ruby/3.1/tiktoken_ruby.so +0 -0
- data/lib/tiktoken_ruby/3.2/tiktoken_ruby.so +0 -0
- data/lib/tiktoken_ruby/encoding.rb +33 -5
- data/lib/tiktoken_ruby/version.rb +1 -1
- data/lib/tiktoken_ruby.rb +101 -1
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0fc02070b46f894e41faf76d01edb3448bcf9fec33d8ce9c84a066f3b6a3c42c
+  data.tar.gz: d3059efb8fb610aabf5a32427d6d82040882b1384c046cdbd8c8f556babbfd54
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1b5826fa7ca9377abc35a5c4dd985afa2960110def15b992d350363b9fe66340d27e3ee9845abd4ba2571e9bec6777eb0d46902cb4b88c5e1ff3fc9ef0ad2346
+  data.tar.gz: f4613b799248700b53e24eaa7e87f3cca774921955b3db8fd5cddecc027b6df37924daa5dabed55943e89bb0bc6d78d7528e6c33eb46dac5ecaa2f1a868af019
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    tiktoken_ruby (0.0.2)
+    tiktoken_ruby (0.0.3)
 
 GEM
   remote: https://rubygems.org/
@@ -12,6 +12,7 @@ GEM
     json (2.6.3)
     language_server-protocol (3.17.0.3)
     method_source (1.0.0)
+    minitest (5.18.0)
     parallel (1.22.1)
     parser (3.2.1.1)
       ast (~> 2.4.1)
@@ -59,6 +60,12 @@ GEM
       rubocop (= 1.48.1)
       rubocop-performance (= 1.16.0)
     unicode-display_width (2.4.2)
+    webrick (1.7.0)
+    yard (0.9.28)
+      webrick (~> 1.7.0)
+    yard-doctest (0.1.17)
+      minitest
+      yard
 
 PLATFORMS
   arm64-darwin-22
@@ -72,6 +79,7 @@ DEPENDENCIES
   rspec (~> 3.0)
   standard (~> 1.3)
   tiktoken_ruby!
+  yard-doctest (~> 0.1.17)
 
 BUNDLED WITH
    2.4.6
data/README.md
CHANGED
@@ -15,11 +15,22 @@ If bundler is not being used to manage dependencies, install the gem by executin
 $ gem install tiktoken_ruby
 
 ## Usage
+Usage should be very similar to the python library. here's a simple example
 
+Encode and decode text
 ```ruby
-
-
-
+require 'tiktoken_ruby'
+
+enc = Tiktoken.get_encoding("cl100k_base")
+enc.decode(enc.encode("hello world")) #=> "hello world"
+```
+
+Encoders can also be retrieved by model name
+```ruby
+require 'tiktoken_ruby'
+
+enc = Tiktoken.encoding_for_model("gpt-4")
+enc.encode("hello world").length #=> 2
 ```
 
 ## Development
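The examples added to the README cover the basic round trip. As the gemspec description added later in this diff puts it, the main use case is counting tokens before sending text to an OpenAI API; a minimal sketch of that workflow, using only the `Tiktoken.encoding_for_model`, `#encode`, and `#decode` calls shown above (the prompt string and the token budget are illustrative, not taken from the gem):

```ruby
require "tiktoken_ruby"

# Pick the encoder that matches the model the text will be sent to.
enc = Tiktoken.encoding_for_model("gpt-3.5-turbo")

prompt = "Translate the following sentence into French: hello world"
tokens = enc.encode(prompt)

# Check the prompt against a token budget before making the API call.
# The 4_000 budget is an illustrative number, not something the gem defines.
max_prompt_tokens = 4_000
raise "Prompt too long: #{tokens.length} tokens" if tokens.length > max_prompt_tokens

# Decoding the tokens returns the original text.
enc.decode(tokens) == prompt #=> true
```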
data/doctest_helper.rb
ADDED
@@ -0,0 +1 @@
+require 'lib/tiktoken_ruby.rb'
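This one-line helper, together with the minitest, webrick, yard, and yard-doctest entries added to Gemfile.lock above, points at the new `@example` tags in the YARD comments being run as documentation tests. A rough sketch of the convention; the Rake/YARD wiring itself is not part of this diff, so the exact invocation is an assumption:

```ruby
# doctest_helper.rb is the support file yard-doctest loads before running the
# examples, so requiring the library here makes Tiktoken available to them.
require 'lib/tiktoken_ruby.rb'

# With that in place, an @example tag such as the one added to Tiktoken.get_encoding:
#
#   # @example Encode and decode text
#   #   enc = Tiktoken.get_encoding("cl100k_base")
#   #   enc.decode(enc.encode("hello world")) #=> "hello world"
#
# is executed by yard-doctest (typically via `yard doctest`), and each trailing
# `#=> value` comment becomes a minitest assertion that the expression really
# returns that value; that is why minitest shows up as a new dependency.
```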
data/lib/tiktoken_ruby/3.1/tiktoken_ruby.so
CHANGED
Binary file
data/lib/tiktoken_ruby/3.2/tiktoken_ruby.so
CHANGED
Binary file
data/lib/tiktoken_ruby/encoding.rb
CHANGED
@@ -1,23 +1,51 @@
 # frozen_string_literal: true
 
-class Tiktoken::Encoding
-
-
+class Tiktoken::Encoding
+  attr_reader :name
+
+  # This returns a new Tiktoken::Encoding instance for the requested encoding
+  # @param encoding [Symbol] The name of the encoding to load
+  # @return [Tiktoken::Encoding] The encoding instance
+  def self.for_name(encoding)
+    Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
   end
 
-
-
+  # This returns a Tiktoken::Encoding instance for the requested encoding
+  # It will reuse an existing encoding if it's already been loaded
+  # @param encoding [Symbol] The name of the encoding to load
+  # @return [Tiktoken::Encoding] The encoding instance
+  def self.for_name_cached(encoding)
+    @encodings ||= {}
+    @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
   end
 
+  # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
+  # basically it's unescaped
+  # @param text [String] The text to encode
+  # @return [Array<Integer>] The encoded tokens
   def encode_ordinary(text)
     @ext_base_bpe.encode_ordinary(text)
   end
 
+  # Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
+  # as text unless they're in the allowed_special array. It's basically like the text was escaped
+  # @param text [String] The text to encode
+  # @param allowed_special [Array<String>] An array of special tokens to allow
+  # @return [Array<Integer>] The encoded tokens
   def encode(text, allowed_special: [])
     @ext_base_bpe.encode(text, allowed_special)
   end
 
+  # Decodes the tokens back into text
+  # @param tokens [Array<Integer>] The tokens to decode
+  # @return [String] The decoded text
   def decode(tokens)
     @ext_base_bpe.decode(tokens)
   end
+
+  private
+  def initialize(ext_base_bpe, name)
+    @ext_base_bpe = ext_base_bpe
+    @name = name
+  end
 end
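The new comments distinguish `encode_ordinary` (special tokens always treated as plain text) from `encode` with its `allowed_special` list. A minimal sketch of the difference, assuming `<|endoftext|>` is registered as a special token in `cl100k_base` (true for the upstream tiktoken data, though not shown in this diff); exact token ids and counts depend on the encoding:

```ruby
require "tiktoken_ruby"

enc = Tiktoken.get_encoding(:cl100k_base)
text = "hello <|endoftext|>"

# encode_ordinary never interprets special tokens: the marker is split up
# and tokenized as ordinary text.
plain = enc.encode_ordinary(text)

# encode behaves the same way unless the special token is explicitly allowed,
# in which case the whole marker collapses to its single reserved token id.
as_text    = enc.encode(text)
as_special = enc.encode(text, allowed_special: ["<|endoftext|>"])

as_special.length < as_text.length #=> true (one id instead of several)
enc.decode(as_special)             #=> "hello <|endoftext|>"
```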
data/lib/tiktoken_ruby.rb
CHANGED
@@ -11,5 +11,105 @@ rescue LoadError
 end
 
 module Tiktoken
-  class
+  class << self
+
+    # Returns an encoding by name. If the encoding is not already loaded it will be loaded, but otherwise
+    # it will reuse the instance of that type that was previous loaded
+    # @param name [Symbol|String] The name of the encoding to load
+    # @return [Tiktoken::Encoding] The encoding instance
+    # @example Encode and decode text
+    #   enc = Tiktoken.get_encoding("cl100k_base")
+    #   enc.decode(enc.encode("hello world")) #=> "hello world"
+    def get_encoding(name)
+      name = name.to_sym
+      return nil unless SUPPORTED_ENCODINGS.include?(name)
+
+      Tiktoken::Encoding.for_name_cached(name)
+    end
+
+    # Gets the encoding for an OpenAI model
+    # @param model_name [Symbol|String] The name of the model to get the encoding for
+    # @return [Tiktoken::Encoding] The encoding instance
+    # @example Count tokens for text
+    #   enc = Tiktoken.encoding_for_model("gpt-4")
+    #   enc.encode("hello world").length #=> 2
+    def encoding_for_model(model_name)
+      for prefix in PREFIX_MODELS
+        if model_name.to_s.start_with?("#{prefix}-")
+          model_name = prefix
+          break
+        end
+      end
+
+      encoding_name = MODEL_TO_ENCODING_NAME[model_name.to_sym]
+      return nil unless encoding_name
+
+      get_encoding(encoding_name)
+    end
+
+    # Lists all the encodings that are supported
+    # @return [Array<Symbol>] The list of supported encodings
+    def list_encoding_names
+      SUPPORTED_ENCODINGS
+    end
+
+    # Lists all the models that are supported
+    # @return [Array<Symbol>] The list of supported models
+    def list_model_names
+      MODEL_TO_ENCODING_NAME.keys
+    end
+
+    private
+
+    SUPPORTED_ENCODINGS = [
+      :r50k_base,
+      :p50k_base,
+      :p50k_edit,
+      :cl100k_base,
+    ]
+
+    # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
+    # that is also MIT licensed but by OpenAI
+    MODEL_TO_ENCODING_NAME = {
+      "gpt-4": "cl100k_base",
+      "gpt-3.5-turbo": "cl100k_base",
+      # text
+      "text-davinci-003": "p50k_base",
+      "text-davinci-002": "p50k_base",
+      "text-davinci-001": "r50k_base",
+      "text-curie-001": "r50k_base",
+      "text-babbage-001": "r50k_base",
+      "text-ada-001": "r50k_base",
+      "davinci": "r50k_base",
+      "curie": "r50k_base",
+      "babbage": "r50k_base",
+      "ada": "r50k_base",
+      # code
+      "code-davinci-002": "p50k_base",
+      "code-davinci-001": "p50k_base",
+      "code-cushman-002": "p50k_base",
+      "code-cushman-001": "p50k_base",
+      "davinci-codex": "p50k_base",
+      "cushman-codex": "p50k_base",
+      # edit
+      "text-davinci-edit-001": "p50k_edit",
+      "code-davinci-edit-001": "p50k_edit",
+      # embeddings
+      "text-embedding-ada-002": "cl100k_base",
+      # old embeddings
+      "text-similarity-davinci-001": "r50k_base",
+      "text-similarity-curie-001": "r50k_base",
+      "text-similarity-babbage-001": "r50k_base",
+      "text-similarity-ada-001": "r50k_base",
+      "text-search-davinci-doc-001": "r50k_base",
+      "text-search-curie-doc-001": "r50k_base",
+      "text-search-babbage-doc-001": "r50k_base",
+      "text-search-ada-doc-001": "r50k_base",
+      "code-search-babbage-code-001": "r50k_base",
+      "code-search-ada-code-001": "r50k_base",
+    }
+
+    # these are models that have a versioned models that are otherwise identical
+    PREFIX_MODELS = ["gpt-4", "gpt-3.5-turbo"]
+  end
 end
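The prefix handling in `encoding_for_model` is what makes versioned model names resolve to the same encoder as their base name, and the two `list_*` helpers expose what the gem knows about. A short usage sketch against the methods added above; the dated model name is illustrative, since only base names appear in `MODEL_TO_ENCODING_NAME`:

```ruby
require "tiktoken_ruby"

# PREFIX_MODELS makes a versioned name such as "gpt-3.5-turbo-0301" fall back
# to its "gpt-3.5-turbo" entry before the encoding lookup.
enc = Tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
enc.name #=> :cl100k_base

# Unknown models and unknown encodings return nil rather than raising.
Tiktoken.encoding_for_model("not-a-real-model") #=> nil
Tiktoken.get_encoding(:unknown_encoding)        #=> nil

# The new introspection helpers.
Tiktoken.list_encoding_names                 #=> [:r50k_base, :p50k_base, :p50k_edit, :cl100k_base]
Tiktoken.list_model_names.include?(:"gpt-4") #=> true
```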
metadata
CHANGED
@@ -1,16 +1,18 @@
 --- !ruby/object:Gem::Specification
 name: tiktoken_ruby
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: x64-mingw-ucrt
 authors:
 - IAPark
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-03-
+date: 2023-03-21 00:00:00.000000000 Z
 dependencies: []
-description:
+description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
+  used by OpenAI. It can be used to count the number of tokens in text before sending
+  it to OpenAI APIs.
 email:
 - isaac.a.park@gmail.com
 executables: []
@@ -24,6 +26,7 @@ files:
 - LICENSE.txt
 - README.md
 - Rakefile
+- doctest_helper.rb
 - lib/tiktoken_ruby.rb
 - lib/tiktoken_ruby/3.1/tiktoken_ruby.so
 - lib/tiktoken_ruby/3.2/tiktoken_ruby.so
@@ -36,6 +39,7 @@ licenses:
 metadata:
   homepage_uri: https://github.com/IAPark/tiktoken_ruby
   source_code_uri: https://github.com/IAPark/tiktoken_ruby
+  documentation_uri: https://rubydoc.info/github/IAPark/tiktoken_ruby/main
 post_install_message:
 rdoc_options: []
 require_paths: