RubyGems - riktoken - Versions diffs - 0.0.1 - Mend

riktoken 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

checksums.yaml +7 -0
data/.rubocop.yml +10 -0
data/CHANGELOG.md +5 -0
data/CODE_OF_CONDUCT.md +132 -0
data/LICENSE.txt +21 -0
data/README.md +205 -0
data/Rakefile +24 -0
data/lib/riktoken/bpe.rb +139 -0
data/lib/riktoken/encoding.rb +52 -0
data/lib/riktoken/encodings/cl100k_base.rb +35 -0
data/lib/riktoken/encodings/o200k_base.rb +40 -0
data/lib/riktoken/encodings/p50k_base.rb +31 -0
data/lib/riktoken/encodings/p50k_edit.rb +35 -0
data/lib/riktoken/encodings/r50k_base.rb +31 -0
data/lib/riktoken/encodings.rb +28 -0
data/lib/riktoken/tiktoken_file.rb +42 -0
data/lib/riktoken/version.rb +5 -0
data/lib/riktoken.rb +155 -0
data/renovate.json +6 -0
data/sig/generated/riktoken/bpe.rbs +55 -0
data/sig/generated/riktoken/encoding.rbs +34 -0
data/sig/generated/riktoken/encodings/cl100k_base.rbs +15 -0
data/sig/generated/riktoken/encodings/o200k_base.rbs +15 -0
data/sig/generated/riktoken/encodings/p50k_base.rbs +15 -0
data/sig/generated/riktoken/encodings/p50k_edit.rbs +17 -0
data/sig/generated/riktoken/encodings/r50k_base.rbs +15 -0
data/sig/generated/riktoken/encodings.rbs +18 -0
data/sig/generated/riktoken/tiktoken_file.rbs +15 -0
data/sig/generated/riktoken/version.rbs +5 -0
data/sig/generated/riktoken.rbs +55 -0
metadata +87 -0

data/lib/riktoken/encodings/p50k_base.rb ADDED Viewed

@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+require_relative "../encodings"
+module Riktoken
+  module Encodings
+    module P50kBase
+      include Riktoken::Encodings
+      ENCODING_NAME = "p50k_base"
+      private_constant :ENCODING_NAME
+      # @rbs tiktoken_base_dir: String -- the directory where tiktoken files are stored
+      # @rbs return: Riktoken::Encoding
+      def self.load_encoding(tiktoken_base_dir:)
+        ranks = TiktokenFile.new.load(find_tiktoken_file(name: ENCODING_NAME, base_dir: tiktoken_base_dir))
+        special_tokens = {
+          "<|endoftext|>" => 50256
+        }
+        pattern = /'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s/
+        Riktoken::Encoding.new(
+          name: ENCODING_NAME,
+          ranks: ranks,
+          special_tokens: special_tokens,
+          pattern: pattern
+        )
+      end
+    end
+  end
+end

data/lib/riktoken/encodings/p50k_edit.rb ADDED Viewed

@@ -0,0 +1,35 @@
+# frozen_string_literal: true
+require_relative "../encodings"
+module Riktoken
+  module Encodings
+    module P50kEdit
+      include Riktoken::Encodings
+      ENCODING_NAME = "p50k_edit"
+      TIKTOKEN_SIGNATURE_NAME = "p50k_base"
+      private_constant :ENCODING_NAME
+      # @rbs tiktoken_base_dir: String -- the directory where tiktoken files are stored
+      # @rbs return: Riktoken::Encoding
+      def self.load_encoding(tiktoken_base_dir:)
+        ranks = TiktokenFile.new.load(find_tiktoken_file(name: TIKTOKEN_SIGNATURE_NAME, base_dir: tiktoken_base_dir))
+        special_tokens = {
+          "<|endoftext|>" => 50256,
+          "<|fim_prefix|>" => 50281,
+          "<|fim_middle|>" => 50282,
+          "<|fim_suffix|>" => 50283
+        }
+        pattern = /'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s/
+        Riktoken::Encoding.new(
+          name: ENCODING_NAME,
+          ranks: ranks,
+          special_tokens: special_tokens,
+          pattern: pattern
+        )
+      end
+    end
+  end
+end

data/lib/riktoken/encodings/r50k_base.rb ADDED Viewed

@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+require_relative "../encodings"
+module Riktoken
+  module Encodings
+    module R50kBase
+      include Riktoken::Encodings
+      ENCODING_NAME = "r50k_base"
+      private_constant :ENCODING_NAME
+      # @rbs tiktoken_base_dir: String -- the directory where tiktoken files are stored
+      # @rbs return: Riktoken::Encoding
+      def self.load_encoding(tiktoken_base_dir:)
+        ranks = TiktokenFile.new.load(find_tiktoken_file(name: ENCODING_NAME, base_dir: tiktoken_base_dir))
+        special_tokens = {
+          "<|endoftext|>" => 50256
+        }
+        pattern = /'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s/
+        Riktoken::Encoding.new(
+          name: ENCODING_NAME,
+          ranks: ranks,
+          special_tokens: special_tokens,
+          pattern: pattern
+        )
+      end
+    end
+  end
+end

data/lib/riktoken/encodings.rb ADDED Viewed

@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+module Riktoken
+  module Encodings
+    class FileNotFoundError < StandardError; end
+    def self.included(base)
+      base.extend(ClassMethods)
+      base.private_class_method :find_tiktoken_file
+    end
+    module ClassMethods
+      # Look for .tiktoken file in common locations
+      # @rbs name: String
+      # @rbs base_dir: String -- a directory to find the tiktoken file
+      # @rbs return: String
+      def find_tiktoken_file(name:, base_dir:)
+        path = File.join(base_dir, "#{name}.tiktoken")
+        if File.exist?(path)
+          path
+        else
+          raise FileNotFoundError, "tiktoken file not found: #{path}"
+        end
+      end
+    end
+  end
+end

data/lib/riktoken/tiktoken_file.rb ADDED Viewed

@@ -0,0 +1,42 @@
+# frozen_string_literal: true
+require "base64"
+module Riktoken
+  class TiktokenFile
+    class ParseError < StandardError; end
+    # Parses a .tiktoken file content and returns a hash mapping base64-encoded tokens to their ranks.
+    # @rbs content: String
+    # @rbs return: Hash[String, Integer]
+    def parse(content)
+      ranks = {}
+      content.each_line do |line|
+        line = line.strip
+        next if line.empty? || line.start_with?("#")
+        parts = line.split(/\s+/)
+        if parts.length != 2
+          raise ParseError, "Invalid line format: #{line}"
+        end
+        begin
+          token = Base64.strict_decode64(parts[0])
+          rank = Integer(parts[1])
+          ranks[token] = rank
+        rescue ArgumentError => e
+          raise ParseError, "Failed to parse line: #{line} - #{e.message}"
+        end
+      end
+      ranks
+    end
+    def load(path)
+      content = File.read(path, encoding: "UTF-8")
+      parse(content)
+    end
+  end
+end

data/lib/riktoken/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+module Riktoken
+  # Version string of this gem release.
+  VERSION = "0.0.1"
+end

data/lib/riktoken.rb ADDED Viewed

@@ -0,0 +1,155 @@
+# frozen_string_literal: true
+require_relative "riktoken/version"
+require_relative "riktoken/encoding"
+require_relative "riktoken/tiktoken_file"
+require_relative "riktoken/encodings/cl100k_base"
+require_relative "riktoken/encodings/p50k_base"
+require_relative "riktoken/encodings/p50k_edit"
+require_relative "riktoken/encodings/r50k_base"
+require_relative "riktoken/encodings/o200k_base"
+module Riktoken
+  # @rbs!
+  #   type rank = Integer
+  #   type tuple[T, U] = [T, U]
+  class UnknownEncodingError < StandardError; end
+  class UnknownModelError < StandardError; end
+  MODEL_TO_ENCODING = {
+    # GPT-4 models
+    "gpt-4" => "cl100k_base",
+    "gpt-4-0314" => "cl100k_base",
+    "gpt-4-0613" => "cl100k_base",
+    "gpt-4-32k" => "cl100k_base",
+    "gpt-4-32k-0314" => "cl100k_base",
+    "gpt-4-32k-0613" => "cl100k_base",
+    # GPT-3.5 models
+    "gpt-3.5-turbo" => "cl100k_base",
+    "gpt-3.5-turbo-0301" => "cl100k_base",
+    "gpt-3.5-turbo-0613" => "cl100k_base",
+    "gpt-3.5-turbo-16k" => "cl100k_base",
+    "gpt-3.5-turbo-16k-0613" => "cl100k_base",
+    # Legacy models
+    "text-davinci-003" => "p50k_base",
+    "text-davinci-002" => "p50k_base",
+    "text-davinci-001" => "r50k_base",
+    "text-curie-001" => "r50k_base",
+    "text-babbage-001" => "r50k_base",
+    "text-ada-001" => "r50k_base",
+    "davinci" => "r50k_base",
+    "curie" => "r50k_base",
+    "babbage" => "r50k_base",
+    "ada" => "r50k_base",
+    # Code models
+    "code-davinci-002" => "p50k_base",
+    "code-davinci-001" => "p50k_base",
+    "code-cushman-002" => "p50k_base",
+    "code-cushman-001" => "p50k_base",
+    "davinci-codex" => "p50k_base",
+    "cushman-codex" => "p50k_base",
+    # Edit models
+    "text-davinci-edit-001" => "p50k_edit",
+    "code-davinci-edit-001" => "p50k_edit",
+    # Embeddings
+    "text-embedding-ada-002" => "cl100k_base",
+    # GPT-4o models
+    "gpt-4o" => "o200k_base",
+    "gpt-4o-mini" => "o200k_base"
+  }.freeze
+  DEFAULT_TIKTOKEN_BASE_DIR = File.join(Dir.home, ".riktoken").freeze
+  TIKTOKEN_BASE_DIR_ENV_KEY = "TIKTOKEN_BASE_DIR"
+  private_constant :MODEL_TO_ENCODING, :DEFAULT_TIKTOKEN_BASE_DIR, :TIKTOKEN_BASE_DIR_ENV_KEY
+  class << self
+    # Get the encoding by name (like "cl100k_base").
+    # @rbs encoding_name: String
+    # @rbs tiktoken_base_dir: String -- Base directory for tiktoken files
+    # @rbs return: Encoding
+    def get_encoding(encoding_name, tiktoken_base_dir: default_tiktoken_base_dir)
+      enc_class = case encoding_name
+      when "cl100k_base"
+        Encodings::Cl100kBase
+      when "p50k_base"
+        Encodings::P50kBase
+      when "p50k_edit"
+        Encodings::P50kEdit
+      when "r50k_base"
+        Encodings::R50kBase
+      when "o200k_base"
+        Encodings::O200kBase
+      else
+        raise UnknownEncodingError, "Unknown encoding: #{encoding_name}"
+      end
+      enc_class.load_encoding(tiktoken_base_dir:)
+    end
+    # @rbs model_name: String -- Name of the model (e.g., "gpt-3.5-turbo")
+    # @rbs tiktoken_base_dir: String -- Base directory for tiktoken files
+    # @rbs return: Encoding
+    def encoding_for_model(model_name, tiktoken_base_dir: default_tiktoken_base_dir)
+      encoding_name = MODEL_TO_ENCODING[model_name]
+      raise UnknownModelError, "Unknown model: #{model_name}" unless encoding_name
+      get_encoding(encoding_name, tiktoken_base_dir:)
+    end
+    # @rbs name: String -- Name of the encoding
+    # @rbs ranks: Hash[String, rank] -- Token to rank mapping
+    # @rbs pattern: Regexp
+    # @rbs special_tokens: Hash[String, rank]
+    # @rbs return: Encoding
+    def make_encoding(name:, ranks:, pattern:, special_tokens: {})
+      Encoding.new(
+        name:,
+        ranks:,
+        special_tokens:,
+        pattern:
+      )
+    end
+    # @rbs path: String -- Path to the .tiktoken file
+    # @rbs name: String -- Name of the encoding
+    # @rbs pattern: Regexp
+    # @rbs special_tokens: Hash[String, rank]
+    # @rbs return: Encoding
+    def encoding_from_file(path:, name:, pattern:, special_tokens: {})
+      parser = TiktokenFile.new
+      ranks = parser.load(path)
+      Encoding.new(
+        name:,
+        ranks:,
+        special_tokens:,
+        pattern:
+      )
+    end
+    # @rbs return: Array[String]
+    def list_encoding_names
+      %w[cl100k_base p50k_base p50k_edit r50k_base o200k_base]
+    end
+    # @rbs return: Array[String]
+    def list_model_names
+      MODEL_TO_ENCODING.keys
+    end
+  end
+  private
+  class << self
+    # @rbs return: String
+    def default_tiktoken_base_dir
+      ENV[TIKTOKEN_BASE_DIR_ENV_KEY] || DEFAULT_TIKTOKEN_BASE_DIR
+    end
+  end
+end

data/renovate.json ADDED Viewed

@@ -0,0 +1,6 @@
+{
+  "$schema": "https://docs.renovatebot.com/renovate-schema.json",
+  "extends": [
+    "config:recommended"
+  ]
+}

data/sig/generated/riktoken/bpe.rbs ADDED Viewed

@@ -0,0 +1,55 @@
+# Generated from lib/riktoken/bpe.rb with RBS::Inline
+module Riktoken
+  # Signature of the BPE class; the implementation is in lib/riktoken/bpe.rb.
+  class BPE
+    # NOTE(review): the condition that raises this error is not visible in the signature -- confirm in bpe.rb.
+    class TextEncodingError < StandardError
+    end
+    # Token string => rank, and the reverse mapping.
+    attr_reader encoder: Hash[String, rank]
+    attr_reader decoder: Hash[rank, String]
+    # Special-token string => rank, and the reverse mapping.
+    attr_reader special_tokens_encoder: Hash[String, rank]
+    attr_reader special_tokens_decoder: Hash[rank, String]
+    attr_reader regex: Regexp
+    # NOTE(review): presumably matches the special-token strings -- confirm in bpe.rb.
+    attr_reader special_regex: Regexp
+    # @rbs encoder: Hash[String, rank]
+    # @rbs regex: Regexp
+    # @rbs special_tokens_encoder: Hash[String, rank]
+    # @rbs return: BPE
+    def initialize: (encoder: Hash[String, rank], regex: Regexp, special_tokens_encoder: Hash[String, rank]) -> BPE
+    # @rbs return: Set[String]
+    def special_tokens: () -> Set[String]
+    # Encode given text into tokens using the BPE encoding, allowing for given special tokens.
+    # NOTE(review): the meaning of the Integer in the returned tuple is not stated here -- confirm in bpe.rb.
+    # @rbs text: String
+    # @rbs allowed_special_tokens: Set[String]
+    # @rbs return: tuple[Array[rank], Integer]
+    def encode: (String text, ?allowed_special_tokens: Set[String]) -> tuple[Array[rank], Integer]
+    # Encode given text into tokens using the BPE encoding without considering special tokens.
+    # @rbs text: String
+    # @rbs return: Array[rank]
+    def encode_ordinary: (String text) -> Array[rank]
+    # Encode given text into tokens using the BPE encoding, allowing for all special tokens.
+    # @rbs text: String
+    # @rbs return: tuple[Array[rank], Integer]
+    def encode_with_special_tokens: (String text) -> tuple[Array[rank], Integer]
+    # Decode given tokens back into text encoded as UTF-8.
+    # @rbs tokens: Array[rank]
+    # @rbs return: String
+    def decode: (Array[rank] tokens) -> String
+    # Class-level helper that turns one piece of text into ranks using the given rank table.
+    # @rbs piece: String
+    # @rbs ranks: Hash[String, rank]
+    # @rbs return: Array[rank]
+    def self.byte_pair_encode: (String piece, Hash[String, rank] ranks) -> Array[rank]
+  end
+end

data/sig/generated/riktoken/encoding.rbs ADDED Viewed

@@ -0,0 +1,34 @@
+# Generated from lib/riktoken/encoding.rb with RBS::Inline
+module Riktoken
+  # Signature of the Encoding class; the implementation is in lib/riktoken/encoding.rb.
+  class Encoding
+    # NOTE(review): presumably raised by encode for special tokens that are not allowed -- confirm in encoding.rb.
+    class DisallowedSpecialTokenError < StandardError
+    end
+    # NOTE(review): presumably raised by decode for an unknown token -- confirm in encoding.rb.
+    class InvalidTokenError < StandardError
+    end
+    # Declared untyped here although initialize takes name as a String.
+    attr_reader name: untyped
+    @special_tokens: Hash[String, rank]
+    @bpe: BPE
+    # @rbs name: String
+    # @rbs ranks: Hash[String, rank]
+    # @rbs special_tokens: Hash[String, rank]
+    # @rbs pattern: Regexp
+    # @rbs return: Encoding
+    def initialize: (name: String, ranks: Hash[String, rank], pattern: Regexp, ?special_tokens: Hash[String, rank]) -> Encoding
+    # Both special-token options take either a Set of token strings or the literal "all".
+    # @rbs text: String
+    # @rbs allowed_special: Set[String]|"all"
+    # @rbs disallowed_special: Set[String]|"all"
+    # @rbs return: Array[rank]
+    def encode: (String text, ?allowed_special: Set[String] | "all", ?disallowed_special: Set[String] | "all") -> Array[rank]
+    # @rbs tokens: Array[rank]
+    # @rbs return: String
+    def decode: (Array[rank] tokens) -> String
+  end
+end

data/sig/generated/riktoken/encodings/cl100k_base.rbs ADDED Viewed

@@ -0,0 +1,15 @@
+# Generated from lib/riktoken/encodings/cl100k_base.rb with RBS::Inline
+module Riktoken
+  module Encodings
+    # Signature of the cl100k_base loader module.
+    module Cl100kBase
+      include Riktoken::Encodings
+      ENCODING_NAME: ::String
+      # @rbs tiktoken_base_dir: String -- the directory where tiktoken files are stored
+      # @rbs return: Riktoken::Encoding
+      def self.load_encoding: (tiktoken_base_dir: String) -> Riktoken::Encoding
+    end
+  end
+end

data/sig/generated/riktoken/encodings/o200k_base.rbs ADDED Viewed

@@ -0,0 +1,15 @@
+# Generated from lib/riktoken/encodings/o200k_base.rb with RBS::Inline
+module Riktoken
+  module Encodings
+    # Signature of the o200k_base loader module.
+    module O200kBase
+      include Riktoken::Encodings
+      ENCODING_NAME: ::String
+      # @rbs tiktoken_base_dir: String -- the directory where tiktoken files are stored
+      # @rbs return: Riktoken::Encoding
+      def self.load_encoding: (tiktoken_base_dir: String) -> Riktoken::Encoding
+    end
+  end
+end

data/sig/generated/riktoken/encodings/p50k_base.rbs ADDED Viewed

@@ -0,0 +1,15 @@
+# Generated from lib/riktoken/encodings/p50k_base.rb with RBS::Inline
+module Riktoken
+  module Encodings
+    # Signature of the p50k_base loader module.
+    module P50kBase
+      include Riktoken::Encodings
+      # Marked private_constant in lib/riktoken/encodings/p50k_base.rb.
+      ENCODING_NAME: ::String
+      # @rbs tiktoken_base_dir: String -- the directory where tiktoken files are stored
+      # @rbs return: Riktoken::Encoding
+      def self.load_encoding: (tiktoken_base_dir: String) -> Riktoken::Encoding
+    end
+  end
+end

data/sig/generated/riktoken/encodings/p50k_edit.rbs ADDED Viewed

@@ -0,0 +1,17 @@
+# Generated from lib/riktoken/encodings/p50k_edit.rb with RBS::Inline
+module Riktoken
+  module Encodings
+    # Signature of the p50k_edit loader module.
+    module P50kEdit
+      include Riktoken::Encodings
+      # Marked private_constant in lib/riktoken/encodings/p50k_edit.rb.
+      ENCODING_NAME: ::String
+      # Name of the rank file that load_encoding looks up ("p50k_base" in the Ruby source).
+      TIKTOKEN_SIGNATURE_NAME: ::String
+      # @rbs tiktoken_base_dir: String -- the directory where tiktoken files are stored
+      # @rbs return: Riktoken::Encoding
+      def self.load_encoding: (tiktoken_base_dir: String) -> Riktoken::Encoding
+    end
+  end
+end

data/sig/generated/riktoken/encodings/r50k_base.rbs ADDED Viewed

@@ -0,0 +1,15 @@
+# Generated from lib/riktoken/encodings/r50k_base.rb with RBS::Inline
+module Riktoken
+  module Encodings
+    # Signature of the r50k_base loader module.
+    module R50kBase
+      include Riktoken::Encodings
+      # Marked private_constant in lib/riktoken/encodings/r50k_base.rb.
+      ENCODING_NAME: ::String
+      # @rbs tiktoken_base_dir: String -- the directory where tiktoken files are stored
+      # @rbs return: Riktoken::Encoding
+      def self.load_encoding: (tiktoken_base_dir: String) -> Riktoken::Encoding
+    end
+  end
+end

data/sig/generated/riktoken/encodings.rbs ADDED Viewed

@@ -0,0 +1,18 @@
+# Generated from lib/riktoken/encodings.rb with RBS::Inline
+module Riktoken
+  # Mixin shared by the per-encoding loader modules.
+  module Encodings
+    # Raised by find_tiktoken_file when the expected file is not found.
+    class FileNotFoundError < StandardError
+    end
+    # Hook run on include: extends the includer with ClassMethods.
+    def self.included: (untyped base) -> untyped
+    module ClassMethods
+      # Resolve "<base_dir>/<name>.tiktoken"; only base_dir is consulted.
+      # Raises FileNotFoundError when no such file is found.
+      # @rbs name: String
+      # @rbs base_dir: String -- a directory to find the tiktoken file
+      # @rbs return: String
+      def find_tiktoken_file: (name: String, base_dir: String) -> String
+    end
+  end
+end

data/sig/generated/riktoken/tiktoken_file.rbs ADDED Viewed

@@ -0,0 +1,15 @@
+# Generated from lib/riktoken/tiktoken_file.rb with RBS::Inline
+module Riktoken
+  # Reader for .tiktoken rank files.
+  class TiktokenFile
+    # Raised by parse for a malformed line.
+    class ParseError < StandardError
+    end
+    # Parses .tiktoken file content and returns a hash mapping each decoded token
+    # (the Base64-decoded first column of a line) to its rank.
+    # @rbs content: String
+    # @rbs return: Hash[String, Integer]
+    def parse: (String content) -> Hash[String, Integer]
+    # Reads the file at path and returns the result of parse.
+    def load: (untyped path) -> untyped
+  end
+end

data/sig/generated/riktoken/version.rbs ADDED Viewed

@@ -0,0 +1,5 @@
+# Generated from lib/riktoken/version.rb with RBS::Inline
+module Riktoken
+  # Version string of this gem release.
+  VERSION: ::String
+end

data/sig/generated/riktoken.rbs ADDED Viewed

@@ -0,0 +1,55 @@
+# Generated from lib/riktoken.rb with RBS::Inline
+module Riktoken
+  type rank = Integer
+  type tuple[T, U] = [ T, U ]
+  # Raised by get_encoding for an encoding name it does not know.
+  class UnknownEncodingError < StandardError
+  end
+  # Raised by encoding_for_model for a model name missing from MODEL_TO_ENCODING.
+  class UnknownModelError < StandardError
+  end
+  # Model name => encoding name (a frozen Hash of Strings in lib/riktoken.rb).
+  MODEL_TO_ENCODING: untyped
+  # Default base directory: ".riktoken" under the user's home directory.
+  DEFAULT_TIKTOKEN_BASE_DIR: untyped
+  # Name of the environment variable that overrides the default base directory.
+  TIKTOKEN_BASE_DIR_ENV_KEY: ::String
+  # Get the encoding by name (like "cl100k_base").
+  # @rbs encoding_name: String
+  # @rbs tiktoken_base_dir: String -- Base directory for tiktoken files
+  # @rbs return: Encoding
+  def self.get_encoding: (String encoding_name, ?tiktoken_base_dir: String) -> Encoding
+  # Get the encoding that MODEL_TO_ENCODING assigns to the given model.
+  # @rbs model_name: String -- Name of the model (e.g., "gpt-3.5-turbo")
+  # @rbs tiktoken_base_dir: String -- Base directory for tiktoken files
+  # @rbs return: Encoding
+  def self.encoding_for_model: (String model_name, ?tiktoken_base_dir: String) -> Encoding
+  # Build an Encoding from ranks that are already in memory.
+  # @rbs name: String -- Name of the encoding
+  # @rbs ranks: Hash[String, rank] -- Token to rank mapping
+  # @rbs pattern: Regexp
+  # @rbs special_tokens: Hash[String, rank]
+  # @rbs return: Encoding
+  def self.make_encoding: (name: String, ranks: Hash[String, rank], pattern: Regexp, ?special_tokens: Hash[String, rank]) -> Encoding
+  # Build an Encoding whose ranks are read from the .tiktoken file at path.
+  # @rbs path: String -- Path to the .tiktoken file
+  # @rbs name: String -- Name of the encoding
+  # @rbs pattern: Regexp
+  # @rbs special_tokens: Hash[String, rank]
+  # @rbs return: Encoding
+  def self.encoding_from_file: (path: String, name: String, pattern: Regexp, ?special_tokens: Hash[String, rank]) -> Encoding
+  # Names accepted by get_encoding.
+  # @rbs return: Array[String]
+  def self.list_encoding_names: () -> Array[String]
+  # Names accepted by encoding_for_model.
+  # @rbs return: Array[String]
+  def self.list_model_names: () -> Array[String]
+  private
+  # TIKTOKEN_BASE_DIR from the environment when set, otherwise DEFAULT_TIKTOKEN_BASE_DIR.
+  # @rbs return: String
+  def self.default_tiktoken_base_dir: () -> String
+end