riktoken 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../encodings"
4
+
5
module Riktoken
  module Encodings
    # Loader for the "p50k_base" encoding.
    module P50kBase
      include Riktoken::Encodings

      ENCODING_NAME = "p50k_base"
      private_constant :ENCODING_NAME

      # Builds the encoding from "p50k_base.tiktoken" located in +tiktoken_base_dir+.
      # The only special token registered is "<|endoftext|>" (id 50256).
      # @rbs tiktoken_base_dir: String -- the directory where tiktoken files are stored
      # @rbs return: Riktoken::Encoding
      def self.load_encoding(tiktoken_base_dir:)
        rank_file = find_tiktoken_file(name: ENCODING_NAME, base_dir: tiktoken_base_dir)
        ranks = TiktokenFile.new.load(rank_file)
        special_tokens = {"<|endoftext|>" => 50256}
        # NOTE(review): Ruby's \s matches ASCII whitespace only -- confirm that text
        # containing non-ASCII whitespace is meant to be split this way.
        pattern = /'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s/

        Riktoken::Encoding.new(name: ENCODING_NAME, ranks:, special_tokens:, pattern:)
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../encodings"
4
+
5
module Riktoken
  module Encodings
    # Loader for the "p50k_edit" encoding.
    module P50kEdit
      include Riktoken::Encodings

      ENCODING_NAME = "p50k_edit"
      # Name used to locate the rank file: this encoding reads "p50k_base.tiktoken".
      TIKTOKEN_SIGNATURE_NAME = "p50k_base"
      private_constant :ENCODING_NAME

      # Builds the encoding from the "p50k_base.tiktoken" rank file located in
      # +tiktoken_base_dir+, registering "<|endoftext|>" and the three FIM special tokens.
      # @rbs tiktoken_base_dir: String -- the directory where tiktoken files are stored
      # @rbs return: Riktoken::Encoding
      def self.load_encoding(tiktoken_base_dir:)
        rank_file = find_tiktoken_file(name: TIKTOKEN_SIGNATURE_NAME, base_dir: tiktoken_base_dir)
        ranks = TiktokenFile.new.load(rank_file)
        special_tokens = {
          "<|endoftext|>" => 50256,
          "<|fim_prefix|>" => 50281,
          "<|fim_middle|>" => 50282,
          "<|fim_suffix|>" => 50283
        }
        # NOTE(review): Ruby's \s matches ASCII whitespace only -- confirm that text
        # containing non-ASCII whitespace is meant to be split this way.
        pattern = /'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s/

        Riktoken::Encoding.new(name: ENCODING_NAME, ranks:, special_tokens:, pattern:)
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../encodings"
4
+
5
module Riktoken
  module Encodings
    # Loader for the "r50k_base" encoding.
    module R50kBase
      include Riktoken::Encodings

      ENCODING_NAME = "r50k_base"
      private_constant :ENCODING_NAME

      # Builds the encoding from "r50k_base.tiktoken" located in +tiktoken_base_dir+.
      # The only special token registered is "<|endoftext|>" (id 50256).
      # @rbs tiktoken_base_dir: String -- the directory where tiktoken files are stored
      # @rbs return: Riktoken::Encoding
      def self.load_encoding(tiktoken_base_dir:)
        rank_file = find_tiktoken_file(name: ENCODING_NAME, base_dir: tiktoken_base_dir)
        ranks = TiktokenFile.new.load(rank_file)
        special_tokens = {"<|endoftext|>" => 50256}
        # NOTE(review): Ruby's \s matches ASCII whitespace only -- confirm that text
        # containing non-ASCII whitespace is meant to be split this way.
        pattern = /'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s/

        Riktoken::Encoding.new(name: ENCODING_NAME, ranks:, special_tokens:, pattern:)
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
module Riktoken
  # Mixin for the built-in encoding modules. Including it gives the including
  # module a private, module-level +find_tiktoken_file+ helper.
  module Encodings
    # Raised when the requested .tiktoken file is not a regular file in the base directory.
    class FileNotFoundError < StandardError; end

    # Include hook: extends +base+ with ClassMethods and then hides the helper so
    # that it can only be called from inside the including module.
    # @rbs base: Module
    # @rbs return: void
    def self.included(base)
      base.extend(ClassMethods)
      base.private_class_method :find_tiktoken_file
    end

    module ClassMethods
      # Resolve "<name>.tiktoken" inside +base_dir+. No other location is searched.
      # Raises FileNotFoundError when no regular file exists at that path.
      # @rbs name: String
      # @rbs base_dir: String -- a directory to find the tiktoken file
      # @rbs return: String
      def find_tiktoken_file(name:, base_dir:)
        path = File.join(base_dir, "#{name}.tiktoken")
        # File.file? instead of File.exist?: a directory that happens to carry the
        # expected name is reported here rather than failing later when it is read.
        return path if File.file?(path)

        raise FileNotFoundError, "tiktoken file not found: #{path}"
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "base64"
4
+
5
module Riktoken
  # Reader for .tiktoken rank files, which hold one "<base64 token> <rank>" pair per line.
  class TiktokenFile
    # Raised when a line is not a valid "<base64 token> <rank>" pair.
    class ParseError < StandardError; end

    # Parses .tiktoken file content and returns a hash mapping each decoded token
    # (the raw bytes, not the base64 text) to its rank. Blank lines and lines
    # starting with "#" are skipped. Raises ParseError on any malformed line.
    # @rbs content: String
    # @rbs return: Hash[String, Integer]
    def parse(content)
      ranks = {}

      content.each_line do |raw_line|
        line = raw_line.strip
        next if line.empty? || line.start_with?("#")

        parts = line.split
        raise ParseError, "Invalid line format: #{line}" unless parts.length == 2

        begin
          # "m0" is the strict base64 directive (what Base64.strict_decode64 wraps);
          # using it directly keeps this parser free of the base64 gem.
          token = parts[0].unpack1("m0")
          # Explicit base 10: without it Integer() honours prefixes, so "010" would
          # be read as octal 8 and "0x10" as 16.
          ranks[token] = Integer(parts[1], 10)
        rescue ArgumentError => e
          raise ParseError, "Failed to parse line: #{line} - #{e.message}"
        end
      end

      ranks
    end

    # Reads the file at +path+ as UTF-8 text and parses it with #parse.
    # @rbs path: String
    # @rbs return: Hash[String, Integer]
    def load(path)
      parse(File.read(path, encoding: "UTF-8"))
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Riktoken
  # Version string of this gem release.
  VERSION = "0.0.1"
end
data/lib/riktoken.rb ADDED
@@ -0,0 +1,155 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "riktoken/version"
4
+ require_relative "riktoken/encoding"
5
+ require_relative "riktoken/tiktoken_file"
6
+ require_relative "riktoken/encodings/cl100k_base"
7
+ require_relative "riktoken/encodings/p50k_base"
8
+ require_relative "riktoken/encodings/p50k_edit"
9
+ require_relative "riktoken/encodings/r50k_base"
10
+ require_relative "riktoken/encodings/o200k_base"
11
+
12
module Riktoken
  # @rbs!
  #   type rank = Integer
  #   type tuple[T, U] = [T, U]

  # Raised by get_encoding when the name is not one of the built-in encodings.
  class UnknownEncodingError < StandardError; end

  # Raised by encoding_for_model when the model name is not listed in MODEL_TO_ENCODING.
  class UnknownModelError < StandardError; end

  # Model name => name of the encoding used for that model.
  MODEL_TO_ENCODING = {
    # GPT-4 models
    "gpt-4" => "cl100k_base",
    "gpt-4-0314" => "cl100k_base",
    "gpt-4-0613" => "cl100k_base",
    "gpt-4-32k" => "cl100k_base",
    "gpt-4-32k-0314" => "cl100k_base",
    "gpt-4-32k-0613" => "cl100k_base",

    # GPT-3.5 models
    "gpt-3.5-turbo" => "cl100k_base",
    "gpt-3.5-turbo-0301" => "cl100k_base",
    "gpt-3.5-turbo-0613" => "cl100k_base",
    "gpt-3.5-turbo-16k" => "cl100k_base",
    "gpt-3.5-turbo-16k-0613" => "cl100k_base",

    # Legacy models
    "text-davinci-003" => "p50k_base",
    "text-davinci-002" => "p50k_base",
    "text-davinci-001" => "r50k_base",
    "text-curie-001" => "r50k_base",
    "text-babbage-001" => "r50k_base",
    "text-ada-001" => "r50k_base",
    "davinci" => "r50k_base",
    "curie" => "r50k_base",
    "babbage" => "r50k_base",
    "ada" => "r50k_base",

    # Code models
    "code-davinci-002" => "p50k_base",
    "code-davinci-001" => "p50k_base",
    "code-cushman-002" => "p50k_base",
    "code-cushman-001" => "p50k_base",
    "davinci-codex" => "p50k_base",
    "cushman-codex" => "p50k_base",

    # Edit models
    "text-davinci-edit-001" => "p50k_edit",
    "code-davinci-edit-001" => "p50k_edit",

    # Embeddings
    "text-embedding-ada-002" => "cl100k_base",

    # GPT-4o models
    "gpt-4o" => "o200k_base",
    "gpt-4o-mini" => "o200k_base"
  }.freeze #: Hash[String, String]
  # Directory searched for .tiktoken files when the environment variable is not set.
  DEFAULT_TIKTOKEN_BASE_DIR = File.join(Dir.home, ".riktoken").freeze #: String
  # Environment variable that overrides the default base directory.
  TIKTOKEN_BASE_DIR_ENV_KEY = "TIKTOKEN_BASE_DIR"
  private_constant :MODEL_TO_ENCODING, :DEFAULT_TIKTOKEN_BASE_DIR, :TIKTOKEN_BASE_DIR_ENV_KEY

  class << self
    # Get the encoding by name (like "cl100k_base").
    # Raises UnknownEncodingError for any name outside list_encoding_names.
    # @rbs encoding_name: String
    # @rbs tiktoken_base_dir: String -- Base directory for tiktoken files
    # @rbs return: Encoding
    def get_encoding(encoding_name, tiktoken_base_dir: default_tiktoken_base_dir)
      enc_class = case encoding_name
      when "cl100k_base"
        Encodings::Cl100kBase
      when "p50k_base"
        Encodings::P50kBase
      when "p50k_edit"
        Encodings::P50kEdit
      when "r50k_base"
        Encodings::R50kBase
      when "o200k_base"
        Encodings::O200kBase
      else
        raise UnknownEncodingError, "Unknown encoding: #{encoding_name}"
      end

      enc_class.load_encoding(tiktoken_base_dir:)
    end

    # Get the encoding used by a model, looked up in MODEL_TO_ENCODING.
    # Raises UnknownModelError when the model name is not listed.
    # @rbs model_name: String -- Name of the model (e.g., "gpt-3.5-turbo")
    # @rbs tiktoken_base_dir: String -- Base directory for tiktoken files
    # @rbs return: Encoding
    def encoding_for_model(model_name, tiktoken_base_dir: default_tiktoken_base_dir)
      encoding_name = MODEL_TO_ENCODING[model_name]
      raise UnknownModelError, "Unknown model: #{model_name}" unless encoding_name

      get_encoding(encoding_name, tiktoken_base_dir:)
    end

    # Build a custom encoding from ranks that are already in memory.
    # @rbs name: String -- Name of the encoding
    # @rbs ranks: Hash[String, rank] -- Token to rank mapping
    # @rbs pattern: Regexp
    # @rbs special_tokens: Hash[String, rank]
    # @rbs return: Encoding
    def make_encoding(name:, ranks:, pattern:, special_tokens: {})
      Encoding.new(
        name:,
        ranks:,
        special_tokens:,
        pattern:
      )
    end

    # Build a custom encoding whose ranks are read from a .tiktoken file.
    # @rbs path: String -- Path to the .tiktoken file
    # @rbs name: String -- Name of the encoding
    # @rbs pattern: Regexp
    # @rbs special_tokens: Hash[String, rank]
    # @rbs return: Encoding
    def encoding_from_file(path:, name:, pattern:, special_tokens: {})
      ranks = TiktokenFile.new.load(path)

      Encoding.new(
        name:,
        ranks:,
        special_tokens:,
        pattern:
      )
    end

    # Names accepted by get_encoding.
    # @rbs return: Array[String]
    def list_encoding_names
      %w[cl100k_base p50k_base p50k_edit r50k_base o200k_base]
    end

    # Model names accepted by encoding_for_model.
    # @rbs return: Array[String]
    def list_model_names
      MODEL_TO_ENCODING.keys
    end

    # `private` has to sit inside `class << self`: a bare `private` in the module
    # body does not change the visibility of singleton methods.
    private

    # Base directory taken from the TIKTOKEN_BASE_DIR environment variable; an unset
    # or empty variable falls back to DEFAULT_TIKTOKEN_BASE_DIR.
    # @rbs return: String
    def default_tiktoken_base_dir
      configured = ENV[TIKTOKEN_BASE_DIR_ENV_KEY]
      # An empty value would make File.join resolve files against the filesystem root.
      return DEFAULT_TIKTOKEN_BASE_DIR if configured.nil? || configured.empty?

      configured
    end
  end
end
data/renovate.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "$schema": "https://docs.renovatebot.com/renovate-schema.json",
3
+ "extends": [
4
+ "config:recommended"
5
+ ]
6
+ }
@@ -0,0 +1,55 @@
1
+ # Generated from lib/riktoken/bpe.rb with RBS::Inline
2
+
3
module Riktoken
  # Signatures for the byte-pair encoder/decoder.
  class BPE
    class TextEncodingError < StandardError
    end

    # Token => rank tables and their rank => token inverses, for ordinary and special tokens.
    attr_reader encoder: Hash[String, rank]
    attr_reader decoder: Hash[rank, String]
    attr_reader special_tokens_encoder: Hash[String, rank]
    attr_reader special_tokens_decoder: Hash[rank, String]

    attr_reader regex: Regexp
    attr_reader special_regex: Regexp

    # All three keyword arguments are required.
    def initialize: (encoder: Hash[String, rank], regex: Regexp, special_tokens_encoder: Hash[String, rank]) -> BPE

    # The special tokens, as a set of strings.
    def special_tokens: () -> Set[String]

    # Encode text into tokens, allowing only the given special tokens.
    # Returns the token ranks paired with an Integer.
    def encode: (String text, ?allowed_special_tokens: Set[String]) -> tuple[Array[rank], Integer]

    # Encode text into tokens without considering special tokens.
    def encode_ordinary: (String text) -> Array[rank]

    # Encode text into tokens, allowing all special tokens.
    def encode_with_special_tokens: (String text) -> tuple[Array[rank], Integer]

    # Decode tokens back into text encoded as UTF-8.
    def decode: (Array[rank] tokens) -> String

    # Byte-pair encode a single piece using the given rank table.
    def self.byte_pair_encode: (String piece, Hash[String, rank] ranks) -> Array[rank]
  end
end
@@ -0,0 +1,34 @@
1
+ # Generated from lib/riktoken/encoding.rb with RBS::Inline
2
+
3
module Riktoken
  # Signatures for a named encoding.
  class Encoding
    class DisallowedSpecialTokenError < StandardError
    end

    class InvalidTokenError < StandardError
    end

    # NOTE(review): presumably a String, since initialize takes `name: String` -- confirm
    # against lib/riktoken/encoding.rb before narrowing the type.
    attr_reader name: untyped

    @special_tokens: Hash[String, rank]

    @bpe: BPE

    # `special_tokens` is the only optional keyword.
    def initialize: (name: String, ranks: Hash[String, rank], pattern: Regexp, ?special_tokens: Hash[String, rank]) -> Encoding

    # Encode text into token ranks. `allowed_special` and `disallowed_special` each take
    # either a set of special-token strings or the literal "all".
    def encode: (String text, ?allowed_special: Set[String] | "all", ?disallowed_special: Set[String] | "all") -> Array[rank]

    # Decode token ranks back into a String.
    def decode: (Array[rank] tokens) -> String
  end
end
@@ -0,0 +1,15 @@
1
+ # Generated from lib/riktoken/encodings/cl100k_base.rb with RBS::Inline
2
+
3
module Riktoken
  module Encodings
    # Signatures for the cl100k_base encoding loader.
    module Cl100kBase
      include Riktoken::Encodings

      ENCODING_NAME: ::String

      # Load the encoding from the directory where tiktoken files are stored.
      def self.load_encoding: (tiktoken_base_dir: String) -> Riktoken::Encoding
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # Generated from lib/riktoken/encodings/o200k_base.rb with RBS::Inline
2
+
3
module Riktoken
  module Encodings
    # Signatures for the o200k_base encoding loader.
    module O200kBase
      include Riktoken::Encodings

      ENCODING_NAME: ::String

      # Load the encoding from the directory where tiktoken files are stored.
      def self.load_encoding: (tiktoken_base_dir: String) -> Riktoken::Encoding
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # Generated from lib/riktoken/encodings/p50k_base.rb with RBS::Inline
2
+
3
module Riktoken
  module Encodings
    # Signatures for the p50k_base encoding loader.
    module P50kBase
      include Riktoken::Encodings

      ENCODING_NAME: ::String

      # Load the encoding from the directory where tiktoken files are stored.
      def self.load_encoding: (tiktoken_base_dir: String) -> Riktoken::Encoding
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # Generated from lib/riktoken/encodings/p50k_edit.rb with RBS::Inline
2
+
3
module Riktoken
  module Encodings
    # Signatures for the p50k_edit encoding loader.
    module P50kEdit
      include Riktoken::Encodings

      ENCODING_NAME: ::String

      TIKTOKEN_SIGNATURE_NAME: ::String

      # Load the encoding from the directory where tiktoken files are stored.
      def self.load_encoding: (tiktoken_base_dir: String) -> Riktoken::Encoding
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # Generated from lib/riktoken/encodings/r50k_base.rb with RBS::Inline
2
+
3
module Riktoken
  module Encodings
    # Signatures for the r50k_base encoding loader.
    module R50kBase
      include Riktoken::Encodings

      ENCODING_NAME: ::String

      # Load the encoding from the directory where tiktoken files are stored.
      def self.load_encoding: (tiktoken_base_dir: String) -> Riktoken::Encoding
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # Generated from lib/riktoken/encodings.rb with RBS::Inline
2
+
3
module Riktoken
  # Signatures for the mixin shared by the encoding loader modules.
  module Encodings
    class FileNotFoundError < StandardError
    end

    # Include hook. It is only ever handed the including module, and its return
    # value carries no meaning for callers.
    def self.included: (Module base) -> void

    module ClassMethods
      # Resolve "<name>.tiktoken" inside base_dir; no other location is searched.
      # @rbs name: String
      # @rbs base_dir: String -- a directory to find the tiktoken file
      # @rbs return: String
      def find_tiktoken_file: (name: String, base_dir: String) -> String
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # Generated from lib/riktoken/tiktoken_file.rb with RBS::Inline
2
+
3
module Riktoken
  # Signatures for the .tiktoken rank-file reader.
  class TiktokenFile
    class ParseError < StandardError
    end

    # Parses .tiktoken file content and returns a hash mapping each decoded token
    # (the raw bytes, not the base64 text) to its rank.
    # @rbs content: String
    # @rbs return: Hash[String, Integer]
    def parse: (String content) -> Hash[String, Integer]

    # Reads the file at path and returns what parse returns for its content.
    def load: (String path) -> Hash[String, Integer]
  end
end
@@ -0,0 +1,5 @@
1
+ # Generated from lib/riktoken/version.rb with RBS::Inline
2
+
3
module Riktoken
  # Version string of this gem release.
  VERSION: ::String
end
@@ -0,0 +1,55 @@
1
+ # Generated from lib/riktoken.rb with RBS::Inline
2
+
3
module Riktoken
  type rank = Integer

  type tuple[T, U] = [ T, U ]

  class UnknownEncodingError < StandardError
  end

  class UnknownModelError < StandardError
  end

  # Model name => encoding name; a frozen hash with String keys and String values.
  MODEL_TO_ENCODING: Hash[String, String]

  # Built with File.join, so always a String.
  DEFAULT_TIKTOKEN_BASE_DIR: String

  TIKTOKEN_BASE_DIR_ENV_KEY: ::String

  # Get the encoding by name (like "cl100k_base").
  # @rbs encoding_name: String
  # @rbs tiktoken_base_dir: String -- Base directory for tiktoken files
  # @rbs return: Encoding
  def self.get_encoding: (String encoding_name, ?tiktoken_base_dir: String) -> Encoding

  # @rbs model_name: String -- Name of the model (e.g., "gpt-3.5-turbo")
  # @rbs tiktoken_base_dir: String -- Base directory for tiktoken files
  # @rbs return: Encoding
  def self.encoding_for_model: (String model_name, ?tiktoken_base_dir: String) -> Encoding

  # @rbs name: String -- Name of the encoding
  # @rbs ranks: Hash[String, rank] -- Token to rank mapping
  # @rbs pattern: Regexp
  # @rbs special_tokens: Hash[String, rank]
  # @rbs return: Encoding
  def self.make_encoding: (name: String, ranks: Hash[String, rank], pattern: Regexp, ?special_tokens: Hash[String, rank]) -> Encoding

  # @rbs path: String -- Path to the .tiktoken file
  # @rbs name: String -- Name of the encoding
  # @rbs pattern: Regexp
  # @rbs special_tokens: Hash[String, rank]
  # @rbs return: Encoding
  def self.encoding_from_file: (path: String, name: String, pattern: Regexp, ?special_tokens: Hash[String, rank]) -> Encoding

  # @rbs return: Array[String]
  def self.list_encoding_names: () -> Array[String]

  # @rbs return: Array[String]
  def self.list_model_names: () -> Array[String]

  private

  # @rbs return: String
  def self.default_tiktoken_base_dir: () -> String
end