tiktoken_ruby 0.0.15-aarch64-linux-musl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 8593709eb39d7a795a0a4bb8f3956c112fe20e4b098e32d18ac0d23e91f57bee
4
+ data.tar.gz: 43be73ed68f5e43384430940c860c16ef558d2ccee8ee96bdf3982a9358ad50e
5
+ SHA512:
6
+ metadata.gz: '0562749ed6720dca83a1f46aed7b53d4724c42567342f3ae59978cce589ddb4647e9f6a8a80d3d60e5fcd78529741bb8e970ada627eb5bb650dfbd3705fb4aa6'
7
+ data.tar.gz: 85006fe97f6ea2ab88a9ce38f6b0cc931ae7faf2e8983d2c36530a31b7d5270666145411c5dd2078df3931c27b652d0ea635bb730cbafe42b5556b95d264e1f9
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.standard.yml ADDED
@@ -0,0 +1,3 @@
1
+ # For available configuration options, see:
2
+ # https://github.com/testdouble/standard
3
+ ruby_version: 2.6
@@ -0,0 +1,11 @@
1
+ {
2
+ "[ruby]": {
3
+ "editor.defaultFormatter": "Shopify.ruby-lsp"
4
+ },
5
+ "[markdown]": {
6
+ "editor.defaultFormatter": "esbenp.prettier-vscode"
7
+ },
8
+ "[github-actions-workflow]": {
9
+ "editor.defaultFormatter": "redhat.vscode-yaml"
10
+ }
11
+ }
data/CHANGELOG.md ADDED
@@ -0,0 +1,35 @@
1
+ # [v0.0.15] - 06-01-2026
2
+ ## What's Changed
3
+ * No longer true by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/96
4
+ * Test build on version change by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/97
5
+ * Bump actions/cache from 4 to 5 in the github-actions group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/98
6
+ * Bump rb-sys from 0.9.117 to 0.9.123 in the cargo group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/99
7
+ * Bump the bundler-dependencies group with 2 updates by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/100
8
+ * Add Ruby 4.0 to build matrix by @chubchenko in https://github.com/IAPark/tiktoken_ruby/pull/101
9
+ * Support Ruby 4 by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/102
10
+
11
+ ## New Contributors
12
+ * @chubchenko made their first contribution in https://github.com/IAPark/tiktoken_ruby/pull/101
13
+
14
+ **Full Changelog**: https://github.com/IAPark/tiktoken_ruby/compare/v0.0.14.1...v0.0.15
15
+ # [v0.0.14.1] - 20-12-2025
16
+ ## What's Changed
17
+ * Cut v0.0.12 by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/78
18
+ * Bump magnus from 0.8.0 to 0.8.1 in the cargo group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/83
19
+ * Bump actions/checkout from 4 to 5 in the github-actions group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/82
20
+ * Bump standard from 1.50.0 to 1.51.1 in the bundler-dependencies group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/81
21
+ * Bump actions/upload-artifact from 4 to 5 in the github-actions group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/84
22
+ * Bump magnus from 0.8.1 to 0.8.2 in the cargo group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/85
23
+ * Bump the bundler-dependencies group with 2 updates by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/86
24
+ * Support by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/87
25
+ * Bump actions/checkout from 5 to 6 in the github-actions group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/89
26
+ * Bump standard from 1.51.1 to 1.52.0 in the bundler-dependencies group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/88
27
+ * release GVL while encoding / decoding tokens by @tenderworks in https://github.com/IAPark/tiktoken_ruby/pull/90
28
+ * Drop Ruby 3.1 support; automate release process by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/92
29
+ * Rewrite history by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/93
30
+ * Force workflow rebuild by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/94
31
+
32
+ ## New Contributors
33
+ * @tenderworks made their first contribution in https://github.com/IAPark/tiktoken_ruby/pull/90
34
+
35
+ **Full Changelog**: https://github.com/IAPark/tiktoken_ruby/compare/v0.0.12...v0.0.14.1
data/Gemfile ADDED
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ gemspec
6
+
7
+ gem "rake"
8
+ gem "rake-compiler"
9
+ gem "rspec"
10
+ gem "standard"
11
+ gem "yard-doctest"
12
+ gem "racc"
data/Gemfile.lock ADDED
@@ -0,0 +1,99 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ tiktoken_ruby (0.0.15)
5
+ rb_sys (~> 0.9)
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ ast (2.4.3)
11
+ diff-lcs (1.6.2)
12
+ json (2.16.0)
13
+ language_server-protocol (3.17.0.5)
14
+ lint_roller (1.1.0)
15
+ minitest (6.0.1)
16
+ prism (~> 1.5)
17
+ parallel (1.27.0)
18
+ parser (3.3.10.0)
19
+ ast (~> 2.4.1)
20
+ racc
21
+ prism (1.6.0)
22
+ racc (1.8.1)
23
+ rainbow (3.1.1)
24
+ rake (13.3.1)
25
+ rake-compiler (1.3.1)
26
+ rake
27
+ rake-compiler-dock (1.10.0)
28
+ rb_sys (0.9.123)
29
+ rake-compiler-dock (= 1.10.0)
30
+ regexp_parser (2.11.3)
31
+ rspec (3.13.2)
32
+ rspec-core (~> 3.13.0)
33
+ rspec-expectations (~> 3.13.0)
34
+ rspec-mocks (~> 3.13.0)
35
+ rspec-core (3.13.6)
36
+ rspec-support (~> 3.13.0)
37
+ rspec-expectations (3.13.5)
38
+ diff-lcs (>= 1.2.0, < 2.0)
39
+ rspec-support (~> 3.13.0)
40
+ rspec-mocks (3.13.7)
41
+ diff-lcs (>= 1.2.0, < 2.0)
42
+ rspec-support (~> 3.13.0)
43
+ rspec-support (3.13.6)
44
+ rubocop (1.81.7)
45
+ json (~> 2.3)
46
+ language_server-protocol (~> 3.17.0.2)
47
+ lint_roller (~> 1.1.0)
48
+ parallel (~> 1.10)
49
+ parser (>= 3.3.0.2)
50
+ rainbow (>= 2.2.2, < 4.0)
51
+ regexp_parser (>= 2.9.3, < 3.0)
52
+ rubocop-ast (>= 1.47.1, < 2.0)
53
+ ruby-progressbar (~> 1.7)
54
+ unicode-display_width (>= 2.4.0, < 4.0)
55
+ rubocop-ast (1.48.0)
56
+ parser (>= 3.3.7.2)
57
+ prism (~> 1.4)
58
+ rubocop-performance (1.25.0)
59
+ lint_roller (~> 1.1)
60
+ rubocop (>= 1.75.0, < 2.0)
61
+ rubocop-ast (>= 1.38.0, < 2.0)
62
+ ruby-progressbar (1.13.0)
63
+ standard (1.52.0)
64
+ language_server-protocol (~> 3.17.0.2)
65
+ lint_roller (~> 1.0)
66
+ rubocop (~> 1.81.7)
67
+ standard-custom (~> 1.0.0)
68
+ standard-performance (~> 1.8)
69
+ standard-custom (1.0.2)
70
+ lint_roller (~> 1.0)
71
+ rubocop (~> 1.50)
72
+ standard-performance (1.8.0)
73
+ lint_roller (~> 1.1)
74
+ rubocop-performance (~> 1.25.0)
75
+ unicode-display_width (3.2.0)
76
+ unicode-emoji (~> 4.1)
77
+ unicode-emoji (4.2.0)
78
+ yard (0.9.37)
79
+ yard-doctest (0.1.17)
80
+ minitest
81
+ yard
82
+
83
+ PLATFORMS
84
+ arm64-darwin-22
85
+ ruby
86
+ x86_64-darwin-22
87
+ x86_64-linux
88
+
89
+ DEPENDENCIES
90
+ racc
91
+ rake
92
+ rake-compiler
93
+ rspec
94
+ standard
95
+ tiktoken_ruby!
96
+ yard-doctest
97
+
98
+ BUNDLED WITH
99
+ 2.6.9
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2023 IAPark
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,94 @@
1
+ [![Gem Version](https://badge.fury.io/rb/tiktoken_ruby.svg)](https://badge.fury.io/rb/tiktoken_ruby)
2
+
3
+ # tiktoken_ruby
4
+
5
+ [Tiktoken](https://github.com/openai/tiktoken) is a BPE tokenizer from OpenAI used with their GPT models.
6
+ This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
7
+
8
+ ## Installation
9
+
10
+ Install the gem and add it to the application's Gemfile by executing:
11
+
12
+ $ bundle add tiktoken_ruby
13
+
14
+ If bundler is not being used to manage dependencies, install the gem by executing:
15
+
16
+ $ gem install tiktoken_ruby
17
+
18
+ ## Usage
19
+
20
+ Usage should be very similar to the Python library. Here's a simple example:
21
+
22
+ Encode and decode text
23
+
24
+ ```ruby
25
+ require 'tiktoken_ruby'
26
+ enc = Tiktoken.get_encoding("cl100k_base")
27
+ enc.decode(enc.encode("hello world")) #=> "hello world"
28
+ ```
29
+
30
+ Encoders can also be retrieved by model name
31
+
32
+ ```ruby
33
+ require 'tiktoken_ruby'
34
+
35
+ enc = Tiktoken.encoding_for_model("gpt-4")
36
+ enc.encode("hello world").length #=> 2
37
+ ```
38
+
39
+ ### Encoding methods
40
+
41
+ There are three methods for encoding text:
42
+
43
+ - `encode_ordinary(text)` - Encodes text, always treating special tokens as ordinary text
44
+ - `encode(text, allowed_special: [])` - Encodes text, treating special tokens as text unless listed in `allowed_special`
45
+ - `encode_with_special_tokens(text)` - Encodes text, recognizing and parsing all special tokens
46
+
47
+ **Special tokens** are control sequences used by OpenAI models, such as `<|endoftext|>`, `<|fim_prefix|>`, `<|fim_middle|>`, and `<|fim_suffix|>`. The encoding methods differ in how they handle these sequences:
48
+
49
+ ```ruby
50
+ enc = Tiktoken.get_encoding("cl100k_base")
51
+ text = "Hello<|endoftext|>World"
52
+
53
+ # encode_ordinary: treats <|endoftext|> as literal characters (9 tokens)
54
+ enc.encode_ordinary(text)
55
+ #=> [9906, 27, 91, 8862, 728, 428, 91, 29, 10343]
56
+
57
+ # encode: same as encode_ordinary by default
58
+ enc.encode(text)
59
+ #=> [9906, 27, 91, 8862, 728, 428, 91, 29, 10343]
60
+
61
+ # encode with allowed_special: recognizes the specified special token (3 tokens)
62
+ enc.encode(text, allowed_special: ["<|endoftext|>"])
63
+ #=> [9906, 100257, 10343]
64
+
65
+ # encode_with_special_tokens: recognizes ALL special tokens (3 tokens)
66
+ enc.encode_with_special_tokens(text)
67
+ #=> [9906, 100257, 10343]
68
+ ```
69
+
70
+ All methods round-trip correctly through `decode`.
71
+
72
+ ## Development
73
+
74
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
75
+
76
+ To install this gem onto your local machine, run `bundle exec rake install`.
77
+
78
+ ## Contributing
79
+
80
+ Bug reports and pull requests are welcome on GitHub at https://github.com/iapark/tiktoken_ruby.
81
+
82
+ To get started with development:
83
+
84
+ ```sh
85
+ git clone https://github.com/IAPark/tiktoken_ruby.git
86
+ cd tiktoken_ruby
87
+ bundle install
88
+ bundle exec rake compile
89
+ bundle exec rake spec
90
+ ```
91
+
92
+ ## License
93
+
94
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+ require "standard/rake"
6
+ require "rake/extensiontask"
7
+ require "rb_sys/extensiontask"
8
+
9
+ GEMSPEC = Gem::Specification.load("tiktoken_ruby.gemspec")
10
+
11
+ RbSys::ExtensionTask.new("tiktoken_ruby", GEMSPEC) do |ext|
12
+ ext.lib_dir = "lib/tiktoken_ruby"
13
+ end
14
+
15
+ RSpec::Core::RakeTask.new(:spec)
16
+
17
+ task :native, [:platform] do |_t, platform:|
18
+ sh "bundle", "exec", "rb-sys-dock", "--platform", platform, "--build"
19
+ end
20
+
21
+ task build: :compile
22
+
23
+ task default: %i[compile spec standard]
24
+
25
+ # Packaging default (non-precompiled) gem
26
+ require "rubygems/package_task"
27
+ gem_path = Gem::PackageTask.new(GEMSPEC).define
28
+ desc "Package the Ruby gem"
29
+ task "package" => [gem_path]
data/doctest_helper.rb ADDED
@@ -0,0 +1 @@
1
+ require "lib/tiktoken_ruby"
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
+ class Tiktoken::Encoding
4
+ CACHE_MUTEX = Mutex.new
5
+
6
+ attr_reader :name
7
+
8
+ # Builds a new Tiktoken::Encoding instance for the requested encoding (no caching)
9
+ # @param encoding [Symbol, String] The name of the encoding to load (converted with to_sym)
10
+ # @return [Tiktoken::Encoding] The encoding instance
11
+ def self.for_name(encoding)
12
+ Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
13
+ end
14
+
15
+ # Returns the Tiktoken::Encoding instance for the requested encoding, building it on first use
16
+ # and reusing the cached instance afterwards; the cache is guarded by CACHE_MUTEX
17
+ # @param encoding [Symbol, String] The name of the encoding to load (converted with to_sym)
18
+ # @return [Tiktoken::Encoding] The encoding instance
19
+ def self.for_name_cached(encoding)
20
+ CACHE_MUTEX.synchronize do
21
+ @encodings ||= {}
22
+ @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
23
+ end
24
+ end
25
+
26
+ # Encodes the text as a list of integer tokens. Special tokens such as "<|endoftext|>" are never
27
+ # recognized; they are always encoded as ordinary text
28
+ # @param text [String] The text to encode
29
+ # @return [Array<Integer>] The encoded tokens
30
+ def encode_ordinary(text)
31
+ @ext_base_bpe.encode_ordinary(text)
32
+ end
33
+
34
+ # Encodes the text as a list of integer tokens. Special tokens are treated as ordinary text
35
+ # unless they are listed in allowed_special, in which case they are encoded as special tokens
36
+ # @param text [String] The text to encode
37
+ # @param allowed_special [Array<String>] An array of special tokens to allow
38
+ # @return [Array<Integer>] The encoded tokens
39
+ def encode(text, allowed_special: [])
40
+ @ext_base_bpe.encode(text, allowed_special)
41
+ end
42
+
43
+ # Encodes the text as a list of integer tokens, recognizing all special tokens.
44
+ # @param text [String] The text to encode
45
+ # @return [Array<Integer>] The encoded tokens
46
+ def encode_with_special_tokens(text)
47
+ @ext_base_bpe.encode_with_special_tokens(text)
48
+ end
49
+
50
+ # Decodes the tokens back into text
51
+ # @param tokens [Array<Integer>] The tokens to decode
52
+ # @return [String] The decoded text
53
+ def decode(tokens)
54
+ @ext_base_bpe.decode(tokens)
55
+ end
56
+
57
+ private
58
+
59
+ def initialize(ext_base_bpe, name)
60
+ @ext_base_bpe = ext_base_bpe
61
+ @name = name
62
+ end
63
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Tiktoken
4
+ VERSION = "0.0.15"
5
+ end
@@ -0,0 +1,156 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "tiktoken_ruby/version"
4
+ require_relative "tiktoken_ruby/encoding"
5
+
6
+ begin
7
+ RUBY_VERSION =~ /(\d+\.\d+)/
8
+ require_relative "tiktoken_ruby/#{$1}/tiktoken_ruby"
9
+ rescue LoadError
10
+ require_relative "tiktoken_ruby/tiktoken_ruby"
11
+ end
12
+
13
+ module Tiktoken
14
+ class << self
15
+ # Returns an encoding by name. If the encoding is not already loaded it will be loaded, but otherwise
16
+ # it will reuse the instance of that type that was previous loaded
17
+ # @param name [Symbol|String] The name of the encoding to load
18
+ # @return [Tiktoken::Encoding] The encoding instance
19
+ # @example Encode and decode text
20
+ # enc = Tiktoken.get_encoding("cl100k_base")
21
+ # enc.decode(enc.encode("hello world")) #=> "hello world"
22
+ def get_encoding(name)
23
+ name = name.to_sym
24
+ return nil unless SUPPORTED_ENCODINGS.include?(name)
25
+
26
+ Tiktoken::Encoding.for_name_cached(name)
27
+ end
28
+
29
+ # Gets the encoding for an OpenAI model
30
+ # @param model_name [Symbol|String] The name of the model to get the encoding for
31
+ # @return [Tiktoken::Encoding, nil] The encoding instance, or nil if no encoding is found
32
+ # @example Count tokens for text
33
+ # enc = Tiktoken.encoding_for_model("gpt-4")
34
+ # enc.encode("hello world").length #=> 2
35
+ def encoding_for_model(model_name)
36
+ if MODEL_TO_ENCODING_NAME.key?(model_name.to_sym)
37
+ return get_encoding(MODEL_TO_ENCODING_NAME[model_name.to_sym])
38
+ end
39
+
40
+ _prefix, encoding = MODEL_PREFIX_TO_ENCODING.find do |prefix, _encoding|
41
+ model_name.start_with?(prefix.to_s)
42
+ end
43
+
44
+ if encoding
45
+ get_encoding(encoding)
46
+ end
47
+ end
48
+
49
+ # Lists all the encodings that are supported
50
+ # @return [Array<Symbol>] The list of supported encodings
51
+ def list_encoding_names
52
+ SUPPORTED_ENCODINGS
53
+ end
54
+
55
+ # Lists all the models that are supported
56
+ # @return [Array<Symbol>] The list of supported models
57
+ def list_model_names
58
+ MODEL_TO_ENCODING_NAME.keys
59
+ end
60
+
61
+ private
62
+
63
+ SUPPORTED_ENCODINGS = [
64
+ :r50k_base,
65
+ :p50k_base,
66
+ :p50k_edit,
67
+ :cl100k_base,
68
+ :o200k_base,
69
+ :o200k_harmony
70
+ ]
71
+
72
+ # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
73
+ # that is also MIT licensed but by OpenAI;
74
+ # https://github.com/Congyuwang/tiktoken-rs/blob/main/tiktoken-rs/src/tokenizer.rs#L50
75
+ # is the source of the mapping for the Rust library
76
+ MODEL_TO_ENCODING_NAME = {
77
+ # reasoning
78
+ o1: "o200k_base",
79
+ o3: "o200k_base",
80
+ "o4-mini": "o200k_base",
81
+ # chat
82
+ "gpt-4.1": "o200k_base",
83
+ "chatgpt-4o": "o200k_base",
84
+ "gpt-4o": "o200k_base",
85
+ "gpt-4": "cl100k_base",
86
+ "gpt-3.5-turbo": "cl100k_base",
87
+ "gpt-3.5": "cl100k_base", # Common shorthand
88
+ "gpt-35-turbo": "cl100k_base", # Azure deployment name
89
+ # base
90
+ "davinci-002": "cl100k_base",
91
+ "babbage-002": "cl100k_base",
92
+ # embeddings
93
+ "text-embedding-ada-002": "cl100k_base",
94
+ "text-embedding-3-small": "cl100k_base",
95
+ "text-embedding-3-large": "cl100k_base",
96
+ # DEPRECATED MODELS
97
+ # text (DEPRECATED)
98
+ "text-davinci-003": "p50k_base",
99
+ "text-davinci-002": "p50k_base",
100
+ "text-davinci-001": "r50k_base",
101
+ "text-curie-001": "r50k_base",
102
+ "text-babbage-001": "r50k_base",
103
+ "text-ada-001": "r50k_base",
104
+ davinci: "r50k_base",
105
+ curie: "r50k_base",
106
+ babbage: "r50k_base",
107
+ ada: "r50k_base",
108
+ # code (DEPRECATED)
109
+ "code-davinci-002": "p50k_base",
110
+ "code-davinci-001": "p50k_base",
111
+ "code-cushman-002": "p50k_base",
112
+ "code-cushman-001": "p50k_base",
113
+ "davinci-codex": "p50k_base",
114
+ "cushman-codex": "p50k_base",
115
+ # edit (DEPRECATED)
116
+ "text-davinci-edit-001": "p50k_edit",
117
+ "code-davinci-edit-001": "p50k_edit",
118
+ # old embeddings (DEPRECATED)
119
+ "text-similarity-davinci-001": "r50k_base",
120
+ "text-similarity-curie-001": "r50k_base",
121
+ "text-similarity-babbage-001": "r50k_base",
122
+ "text-similarity-ada-001": "r50k_base",
123
+ "text-search-davinci-doc-001": "r50k_base",
124
+ "text-search-curie-doc-001": "r50k_base",
125
+ "text-search-babbage-doc-001": "r50k_base",
126
+ "text-search-ada-doc-001": "r50k_base",
127
+ "code-search-babbage-code-001": "r50k_base",
128
+ "code-search-ada-code-001": "r50k_base",
129
+ # open source
130
+ gpt2: "gpt2"
131
+ }
132
+
133
+ MODEL_PREFIX_TO_ENCODING = {
134
+ # reasoning
135
+ "o1-": "o200k_base",
136
+ "o3-": "o200k_base",
137
+ "o4-": "o200k_base",
138
+ # chat
139
+ "gpt-5-": "o200k_base",
140
+ "gpt-4.5-": "o200k_base",
141
+ "gpt-4.1-": "o200k_base",
142
+ "chatgpt-4o-": "o200k_base",
143
+ "gpt-4o-": "o200k_base", # e.g., gpt-4o-2024-05-13, etc.
144
+ "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
145
+ "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
146
+ "gpt-35-turbo-": "cl100k_base", # Azure deployment name
147
+ "gpt-oss-": "o200k_harmony",
148
+ # fine-tuned
149
+ "ft:gpt-4o": "cl100k_base",
150
+ "ft:gpt-4": "cl100k_base",
151
+ "ft:gpt-3.5-turbo": "cl100k_base",
152
+ "ft:davinci-002": "cl100k_base",
153
+ "ft:babbage-002": "cl100k_base"
154
+ }
155
+ end
156
+ end
@@ -0,0 +1,4 @@
1
+ module Tiktoken
2
+ VERSION: String
3
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tiktoken_ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.15
5
+ platform: aarch64-linux-musl
6
+ authors:
7
+ - IAPark
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2026-01-06 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
14
+ used by OpenAI. It can be used to count the number of tokens in text before sending
15
+ it to OpenAI APIs.
16
+ email:
17
+ - isaac.a.park@gmail.com
18
+ executables: []
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - ".rspec"
23
+ - ".standard.yml"
24
+ - ".vscode/settings.json"
25
+ - CHANGELOG.md
26
+ - Gemfile
27
+ - Gemfile.lock
28
+ - LICENSE.txt
29
+ - README.md
30
+ - Rakefile
31
+ - doctest_helper.rb
32
+ - lib/tiktoken_ruby.rb
33
+ - lib/tiktoken_ruby/3.2/tiktoken_ruby.so
34
+ - lib/tiktoken_ruby/3.3/tiktoken_ruby.so
35
+ - lib/tiktoken_ruby/3.4/tiktoken_ruby.so
36
+ - lib/tiktoken_ruby/4.0/tiktoken_ruby.so
37
+ - lib/tiktoken_ruby/encoding.rb
38
+ - lib/tiktoken_ruby/version.rb
39
+ - sig/tiktoken_ruby.rbs
40
+ homepage: https://github.com/IAPark/tiktoken_ruby
41
+ licenses:
42
+ - MIT
43
+ metadata:
44
+ homepage_uri: https://github.com/IAPark/tiktoken_ruby
45
+ source_code_uri: https://github.com/IAPark/tiktoken_ruby
46
+ documentation_uri: https://rubydoc.info/github/IAPark/tiktoken_ruby/main
47
+ post_install_message:
48
+ rdoc_options: []
49
+ require_paths:
50
+ - lib
51
+ required_ruby_version: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: '3.2'
56
+ - - "<"
57
+ - !ruby/object:Gem::Version
58
+ version: 4.1.dev
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: 3.4.0
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: 3.3.22
67
+ requirements: []
68
+ rubygems_version: 3.5.23
69
+ signing_key:
70
+ specification_version: 4
71
+ summary: Ruby wrapper for Tiktoken
72
+ test_files: []