tiktoken_ruby 0.0.15-aarch64-linux-musl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.standard.yml +3 -0
- data/.vscode/settings.json +11 -0
- data/CHANGELOG.md +35 -0
- data/Gemfile +12 -0
- data/Gemfile.lock +99 -0
- data/LICENSE.txt +21 -0
- data/README.md +94 -0
- data/Rakefile +29 -0
- data/doctest_helper.rb +1 -0
- data/lib/tiktoken_ruby/3.2/tiktoken_ruby.so +0 -0
- data/lib/tiktoken_ruby/3.3/tiktoken_ruby.so +0 -0
- data/lib/tiktoken_ruby/3.4/tiktoken_ruby.so +0 -0
- data/lib/tiktoken_ruby/4.0/tiktoken_ruby.so +0 -0
- data/lib/tiktoken_ruby/encoding.rb +63 -0
- data/lib/tiktoken_ruby/version.rb +5 -0
- data/lib/tiktoken_ruby.rb +156 -0
- data/sig/tiktoken_ruby.rbs +4 -0
- metadata +72 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 8593709eb39d7a795a0a4bb8f3956c112fe20e4b098e32d18ac0d23e91f57bee
|
|
4
|
+
data.tar.gz: 43be73ed68f5e43384430940c860c16ef558d2ccee8ee96bdf3982a9358ad50e
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: '0562749ed6720dca83a1f46aed7b53d4724c42567342f3ae59978cce589ddb4647e9f6a8a80d3d60e5fcd78529741bb8e970ada627eb5bb650dfbd3705fb4aa6'
|
|
7
|
+
data.tar.gz: 85006fe97f6ea2ab88a9ce38f6b0cc931ae7faf2e8983d2c36530a31b7d5270666145411c5dd2078df3931c27b652d0ea635bb730cbafe42b5556b95d264e1f9
|
data/.rspec
ADDED
data/.standard.yml
ADDED
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# [v0.0.15] - 06-01-2026
|
|
2
|
+
## What's Changed
|
|
3
|
+
* No longer true by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/96
|
|
4
|
+
* Test build on version change by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/97
|
|
5
|
+
* Bump actions/cache from 4 to 5 in the github-actions group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/98
|
|
6
|
+
* Bump rb-sys from 0.9.117 to 0.9.123 in the cargo group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/99
|
|
7
|
+
* Bump the bundler-dependencies group with 2 updates by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/100
|
|
8
|
+
* Add Ruby 4.0 to build matrix by @chubchenko in https://github.com/IAPark/tiktoken_ruby/pull/101
|
|
9
|
+
* Support Ruby 4 by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/102
|
|
10
|
+
|
|
11
|
+
## New Contributors
|
|
12
|
+
* @chubchenko made their first contribution in https://github.com/IAPark/tiktoken_ruby/pull/101
|
|
13
|
+
|
|
14
|
+
**Full Changelog**: https://github.com/IAPark/tiktoken_ruby/compare/v0.0.14.1...v0.0.15
|
|
15
|
+
# [v0.0.14.1] - 20-12-2025
|
|
16
|
+
## What's Changed
|
|
17
|
+
* Cut v0.0.12 by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/78
|
|
18
|
+
* Bump magnus from 0.8.0 to 0.8.1 in the cargo group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/83
|
|
19
|
+
* Bump actions/checkout from 4 to 5 in the github-actions group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/82
|
|
20
|
+
* Bump standard from 1.50.0 to 1.51.1 in the bundler-dependencies group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/81
|
|
21
|
+
* Bump actions/upload-artifact from 4 to 5 in the github-actions group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/84
|
|
22
|
+
* Bump magnus from 0.8.1 to 0.8.2 in the cargo group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/85
|
|
23
|
+
* Bump the bundler-dependencies group with 2 updates by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/86
|
|
24
|
+
* Support by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/87
|
|
25
|
+
* Bump actions/checkout from 5 to 6 in the github-actions group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/89
|
|
26
|
+
* Bump standard from 1.51.1 to 1.52.0 in the bundler-dependencies group by @dependabot[bot] in https://github.com/IAPark/tiktoken_ruby/pull/88
|
|
27
|
+
* release GVL while encoding / decoding tokens by @tenderworks in https://github.com/IAPark/tiktoken_ruby/pull/90
|
|
28
|
+
* Drop Ruby 3.1 support; automate release process by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/92
|
|
29
|
+
* Rewrite history by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/93
|
|
30
|
+
* Force workflow rebuild by @gjtorikian in https://github.com/IAPark/tiktoken_ruby/pull/94
|
|
31
|
+
|
|
32
|
+
## New Contributors
|
|
33
|
+
* @tenderworks made their first contribution in https://github.com/IAPark/tiktoken_ruby/pull/90
|
|
34
|
+
|
|
35
|
+
**Full Changelog**: https://github.com/IAPark/tiktoken_ruby/compare/v0.0.12...v0.0.14.1
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
PATH
|
|
2
|
+
remote: .
|
|
3
|
+
specs:
|
|
4
|
+
tiktoken_ruby (0.0.15)
|
|
5
|
+
rb_sys (~> 0.9)
|
|
6
|
+
|
|
7
|
+
GEM
|
|
8
|
+
remote: https://rubygems.org/
|
|
9
|
+
specs:
|
|
10
|
+
ast (2.4.3)
|
|
11
|
+
diff-lcs (1.6.2)
|
|
12
|
+
json (2.16.0)
|
|
13
|
+
language_server-protocol (3.17.0.5)
|
|
14
|
+
lint_roller (1.1.0)
|
|
15
|
+
minitest (6.0.1)
|
|
16
|
+
prism (~> 1.5)
|
|
17
|
+
parallel (1.27.0)
|
|
18
|
+
parser (3.3.10.0)
|
|
19
|
+
ast (~> 2.4.1)
|
|
20
|
+
racc
|
|
21
|
+
prism (1.6.0)
|
|
22
|
+
racc (1.8.1)
|
|
23
|
+
rainbow (3.1.1)
|
|
24
|
+
rake (13.3.1)
|
|
25
|
+
rake-compiler (1.3.1)
|
|
26
|
+
rake
|
|
27
|
+
rake-compiler-dock (1.10.0)
|
|
28
|
+
rb_sys (0.9.123)
|
|
29
|
+
rake-compiler-dock (= 1.10.0)
|
|
30
|
+
regexp_parser (2.11.3)
|
|
31
|
+
rspec (3.13.2)
|
|
32
|
+
rspec-core (~> 3.13.0)
|
|
33
|
+
rspec-expectations (~> 3.13.0)
|
|
34
|
+
rspec-mocks (~> 3.13.0)
|
|
35
|
+
rspec-core (3.13.6)
|
|
36
|
+
rspec-support (~> 3.13.0)
|
|
37
|
+
rspec-expectations (3.13.5)
|
|
38
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
39
|
+
rspec-support (~> 3.13.0)
|
|
40
|
+
rspec-mocks (3.13.7)
|
|
41
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
42
|
+
rspec-support (~> 3.13.0)
|
|
43
|
+
rspec-support (3.13.6)
|
|
44
|
+
rubocop (1.81.7)
|
|
45
|
+
json (~> 2.3)
|
|
46
|
+
language_server-protocol (~> 3.17.0.2)
|
|
47
|
+
lint_roller (~> 1.1.0)
|
|
48
|
+
parallel (~> 1.10)
|
|
49
|
+
parser (>= 3.3.0.2)
|
|
50
|
+
rainbow (>= 2.2.2, < 4.0)
|
|
51
|
+
regexp_parser (>= 2.9.3, < 3.0)
|
|
52
|
+
rubocop-ast (>= 1.47.1, < 2.0)
|
|
53
|
+
ruby-progressbar (~> 1.7)
|
|
54
|
+
unicode-display_width (>= 2.4.0, < 4.0)
|
|
55
|
+
rubocop-ast (1.48.0)
|
|
56
|
+
parser (>= 3.3.7.2)
|
|
57
|
+
prism (~> 1.4)
|
|
58
|
+
rubocop-performance (1.25.0)
|
|
59
|
+
lint_roller (~> 1.1)
|
|
60
|
+
rubocop (>= 1.75.0, < 2.0)
|
|
61
|
+
rubocop-ast (>= 1.38.0, < 2.0)
|
|
62
|
+
ruby-progressbar (1.13.0)
|
|
63
|
+
standard (1.52.0)
|
|
64
|
+
language_server-protocol (~> 3.17.0.2)
|
|
65
|
+
lint_roller (~> 1.0)
|
|
66
|
+
rubocop (~> 1.81.7)
|
|
67
|
+
standard-custom (~> 1.0.0)
|
|
68
|
+
standard-performance (~> 1.8)
|
|
69
|
+
standard-custom (1.0.2)
|
|
70
|
+
lint_roller (~> 1.0)
|
|
71
|
+
rubocop (~> 1.50)
|
|
72
|
+
standard-performance (1.8.0)
|
|
73
|
+
lint_roller (~> 1.1)
|
|
74
|
+
rubocop-performance (~> 1.25.0)
|
|
75
|
+
unicode-display_width (3.2.0)
|
|
76
|
+
unicode-emoji (~> 4.1)
|
|
77
|
+
unicode-emoji (4.2.0)
|
|
78
|
+
yard (0.9.37)
|
|
79
|
+
yard-doctest (0.1.17)
|
|
80
|
+
minitest
|
|
81
|
+
yard
|
|
82
|
+
|
|
83
|
+
PLATFORMS
|
|
84
|
+
arm64-darwin-22
|
|
85
|
+
ruby
|
|
86
|
+
x86_64-darwin-22
|
|
87
|
+
x86_64-linux
|
|
88
|
+
|
|
89
|
+
DEPENDENCIES
|
|
90
|
+
racc
|
|
91
|
+
rake
|
|
92
|
+
rake-compiler
|
|
93
|
+
rspec
|
|
94
|
+
standard
|
|
95
|
+
tiktoken_ruby!
|
|
96
|
+
yard-doctest
|
|
97
|
+
|
|
98
|
+
BUNDLED WITH
|
|
99
|
+
2.6.9
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2023 IAPark
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
[![Gem Version](https://badge.fury.io/rb/tiktoken_ruby.svg)](https://badge.fury.io/rb/tiktoken_ruby)
|
|
2
|
+
|
|
3
|
+
# tiktoken_ruby
|
|
4
|
+
|
|
5
|
+
[Tiktoken](https://github.com/openai/tiktoken) is a BPE tokenizer from OpenAI used with their GPT models.
|
|
6
|
+
This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
|
|
7
|
+
|
|
8
|
+
## Installation
|
|
9
|
+
|
|
10
|
+
Install the gem and add to the application's Gemfile by executing:
|
|
11
|
+
|
|
12
|
+
$ bundle add tiktoken_ruby
|
|
13
|
+
|
|
14
|
+
If bundler is not being used to manage dependencies, install the gem by executing:
|
|
15
|
+
|
|
16
|
+
$ gem install tiktoken_ruby
|
|
17
|
+
|
|
18
|
+
## Usage
|
|
19
|
+
|
|
20
|
+
Usage should be very similar to the Python library. Here's a simple example:
|
|
21
|
+
|
|
22
|
+
Encode and decode text
|
|
23
|
+
|
|
24
|
+
```ruby
|
|
25
|
+
require 'tiktoken_ruby'
|
|
26
|
+
enc = Tiktoken.get_encoding("cl100k_base")
|
|
27
|
+
enc.decode(enc.encode("hello world")) #=> "hello world"
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Encoders can also be retrieved by model name
|
|
31
|
+
|
|
32
|
+
```ruby
|
|
33
|
+
require 'tiktoken_ruby'
|
|
34
|
+
|
|
35
|
+
enc = Tiktoken.encoding_for_model("gpt-4")
|
|
36
|
+
enc.encode("hello world").length #=> 2
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### Encoding methods
|
|
40
|
+
|
|
41
|
+
There are three methods for encoding text:
|
|
42
|
+
|
|
43
|
+
- `encode_ordinary(text)` - Encodes text, always treating special tokens as ordinary text
|
|
44
|
+
- `encode(text, allowed_special: [])` - Encodes text, treating special tokens as text unless listed in `allowed_special`
|
|
45
|
+
- `encode_with_special_tokens(text)` - Encodes text, recognizing and parsing all special tokens
|
|
46
|
+
|
|
47
|
+
**Special tokens** are control sequences used by OpenAI models, such as `<|endoftext|>`, `<|fim_prefix|>`, `<|fim_middle|>`, and `<|fim_suffix|>`. The encoding methods differ in how they handle these sequences:
|
|
48
|
+
|
|
49
|
+
```ruby
|
|
50
|
+
enc = Tiktoken.get_encoding("cl100k_base")
|
|
51
|
+
text = "Hello<|endoftext|>World"
|
|
52
|
+
|
|
53
|
+
# encode_ordinary: treats <|endoftext|> as literal characters (9 tokens)
|
|
54
|
+
enc.encode_ordinary(text)
|
|
55
|
+
#=> [9906, 27, 91, 8862, 728, 428, 91, 29, 10343]
|
|
56
|
+
|
|
57
|
+
# encode: same as encode_ordinary by default
|
|
58
|
+
enc.encode(text)
|
|
59
|
+
#=> [9906, 27, 91, 8862, 728, 428, 91, 29, 10343]
|
|
60
|
+
|
|
61
|
+
# encode with allowed_special: recognizes the specified special token (3 tokens)
|
|
62
|
+
enc.encode(text, allowed_special: ["<|endoftext|>"])
|
|
63
|
+
#=> [9906, 100257, 10343]
|
|
64
|
+
|
|
65
|
+
# encode_with_special_tokens: recognizes ALL special tokens (3 tokens)
|
|
66
|
+
enc.encode_with_special_tokens(text)
|
|
67
|
+
#=> [9906, 100257, 10343]
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
All methods round-trip correctly through `decode`.
|
|
71
|
+
|
|
72
|
+
## Development
|
|
73
|
+
|
|
74
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
|
75
|
+
|
|
76
|
+
To install this gem onto your local machine, run `bundle exec rake install`.
|
|
77
|
+
|
|
78
|
+
## Contributing
|
|
79
|
+
|
|
80
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/iapark/tiktoken_ruby.
|
|
81
|
+
|
|
82
|
+
To get started with development:
|
|
83
|
+
|
|
84
|
+
```sh
|
|
85
|
+
git clone https://github.com/IAPark/tiktoken_ruby.git
|
|
86
|
+
cd tiktoken_ruby
|
|
87
|
+
bundle install
|
|
88
|
+
bundle exec rake compile
|
|
89
|
+
bundle exec rake spec
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## License
|
|
93
|
+
|
|
94
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "bundler/gem_tasks"
|
|
4
|
+
require "rspec/core/rake_task"
|
|
5
|
+
require "standard/rake"
|
|
6
|
+
require "rake/extensiontask"
|
|
7
|
+
require "rb_sys/extensiontask"
|
|
8
|
+
|
|
9
|
+
GEMSPEC = Gem::Specification.load("tiktoken_ruby.gemspec")
|
|
10
|
+
|
|
11
|
+
RbSys::ExtensionTask.new("tiktoken_ruby", GEMSPEC) do |ext|
|
|
12
|
+
ext.lib_dir = "lib/tiktoken_ruby"
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
RSpec::Core::RakeTask.new(:spec)
|
|
16
|
+
|
|
17
|
+
task :native, [:platform] do |_t, platform:|
|
|
18
|
+
sh "bundle", "exec", "rb-sys-dock", "--platform", platform, "--build"
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
task build: :compile
|
|
22
|
+
|
|
23
|
+
task default: %i[compile spec standard]
|
|
24
|
+
|
|
25
|
+
# Packaging default (non-precompiled) gem
|
|
26
|
+
require "rubygems/package_task"
|
|
27
|
+
gem_path = Gem::PackageTask.new(GEMSPEC).define
|
|
28
|
+
desc "Package the Ruby gem"
|
|
29
|
+
task "package" => [gem_path]
|
data/doctest_helper.rb
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
require "lib/tiktoken_ruby"
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
class Tiktoken::Encoding
|
|
4
|
+
CACHE_MUTEX = Mutex.new
|
|
5
|
+
|
|
6
|
+
attr_reader :name
|
|
7
|
+
|
|
8
|
+
# This returns a new Tiktoken::Encoding instance for the requested encoding
|
|
9
|
+
# @param encoding [Symbol] The name of the encoding to load
|
|
10
|
+
# @return [Tiktoken::Encoding] The encoding instance
|
|
11
|
+
def self.for_name(encoding)
|
|
12
|
+
Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
# This returns a Tiktoken::Encoding instance for the requested encoding
|
|
16
|
+
# It will reuse an existing encoding if it's already been loaded
|
|
17
|
+
# @param encoding [Symbol] The name of the encoding to load
|
|
18
|
+
# @return [Tiktoken::Encoding] The encoding instance
|
|
19
|
+
def self.for_name_cached(encoding)
|
|
20
|
+
CACHE_MUTEX.synchronize do
|
|
21
|
+
@encodings ||= {}
|
|
22
|
+
@encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
# Encodes the text as a list of integer tokens. Special tokens are never recognized;
|
|
27
|
+
# they are always encoded as ordinary text
|
|
28
|
+
# @param text [String] The text to encode
|
|
29
|
+
# @return [Array<Integer>] The encoded tokens
|
|
30
|
+
def encode_ordinary(text)
|
|
31
|
+
@ext_base_bpe.encode_ordinary(text)
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
# Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
|
|
35
|
+
# as text unless they're in the allowed_special array. It's basically like the text was escaped
|
|
36
|
+
# @param text [String] The text to encode
|
|
37
|
+
# @param allowed_special [Array<String>] An array of special tokens to allow
|
|
38
|
+
# @return [Array<Integer>] The encoded tokens
|
|
39
|
+
def encode(text, allowed_special: [])
|
|
40
|
+
@ext_base_bpe.encode(text, allowed_special)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
# Encodes the text as a list of integer tokens, including special tokens.
|
|
44
|
+
# @param text [String] The text to encode
|
|
45
|
+
# @return [Array<Integer>] The encoded tokens
|
|
46
|
+
def encode_with_special_tokens(text)
|
|
47
|
+
@ext_base_bpe.encode_with_special_tokens(text)
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
# Decodes the tokens back into text
|
|
51
|
+
# @param tokens [Array<Integer>] The tokens to decode
|
|
52
|
+
# @return [String] The decoded text
|
|
53
|
+
def decode(tokens)
|
|
54
|
+
@ext_base_bpe.decode(tokens)
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
private
|
|
58
|
+
|
|
59
|
+
def initialize(ext_base_bpe, name)
|
|
60
|
+
@ext_base_bpe = ext_base_bpe
|
|
61
|
+
@name = name
|
|
62
|
+
end
|
|
63
|
+
end
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "tiktoken_ruby/version"
|
|
4
|
+
require_relative "tiktoken_ruby/encoding"
|
|
5
|
+
|
|
6
|
+
begin
|
|
7
|
+
RUBY_VERSION =~ /(\d+\.\d+)/
|
|
8
|
+
require_relative "tiktoken_ruby/#{$1}/tiktoken_ruby"
|
|
9
|
+
rescue LoadError
|
|
10
|
+
require_relative "tiktoken_ruby/tiktoken_ruby"
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
module Tiktoken
|
|
14
|
+
class << self
|
|
15
|
+
# Returns an encoding by name. If the encoding is not already loaded it will be loaded, but otherwise
|
|
16
|
+
# it will reuse the instance of that type that was previously loaded
|
|
17
|
+
# @param name [Symbol|String] The name of the encoding to load
|
|
18
|
+
# @return [Tiktoken::Encoding, nil] The encoding instance, or nil if the encoding is not supported
|
|
19
|
+
# @example Encode and decode text
|
|
20
|
+
# enc = Tiktoken.get_encoding("cl100k_base")
|
|
21
|
+
# enc.decode(enc.encode("hello world")) #=> "hello world"
|
|
22
|
+
def get_encoding(name)
|
|
23
|
+
name = name.to_sym
|
|
24
|
+
return nil unless SUPPORTED_ENCODINGS.include?(name)
|
|
25
|
+
|
|
26
|
+
Tiktoken::Encoding.for_name_cached(name)
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
# Gets the encoding for an OpenAI model
|
|
30
|
+
# @param model_name [Symbol|String] The name of the model to get the encoding for
|
|
31
|
+
# @return [Tiktoken::Encoding, nil] The encoding instance, or nil if no encoding is found
|
|
32
|
+
# @example Count tokens for text
|
|
33
|
+
# enc = Tiktoken.encoding_for_model("gpt-4")
|
|
34
|
+
# enc.encode("hello world").length #=> 2
|
|
35
|
+
def encoding_for_model(model_name)
|
|
36
|
+
if MODEL_TO_ENCODING_NAME.key?(model_name.to_sym)
|
|
37
|
+
return get_encoding(MODEL_TO_ENCODING_NAME[model_name.to_sym])
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
_prefix, encoding = MODEL_PREFIX_TO_ENCODING.find do |prefix, _encoding|
|
|
41
|
+
model_name.start_with?(prefix.to_s)
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
if encoding
|
|
45
|
+
get_encoding(encoding)
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
# Lists all the encodings that are supported
|
|
50
|
+
# @return [Array<Symbol>] The list of supported encodings
|
|
51
|
+
def list_encoding_names
|
|
52
|
+
SUPPORTED_ENCODINGS
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
# Lists all the models that are supported
|
|
56
|
+
# @return [Array<Symbol>] The list of supported models
|
|
57
|
+
def list_model_names
|
|
58
|
+
MODEL_TO_ENCODING_NAME.keys
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
private
|
|
62
|
+
|
|
63
|
+
SUPPORTED_ENCODINGS = [
|
|
64
|
+
:r50k_base,
|
|
65
|
+
:p50k_base,
|
|
66
|
+
:p50k_edit,
|
|
67
|
+
:cl100k_base,
|
|
68
|
+
:o200k_base,
|
|
69
|
+
:o200k_harmony
|
|
70
|
+
]
|
|
71
|
+
|
|
72
|
+
# taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
|
|
73
|
+
# that is also MIT licensed but by OpenAI;
|
|
74
|
+
# https://github.com/Congyuwang/tiktoken-rs/blob/main/tiktoken-rs/src/tokenizer.rs#L50
|
|
75
|
+
# is the source of the mapping for the Rust library
|
|
76
|
+
MODEL_TO_ENCODING_NAME = {
|
|
77
|
+
# reasoning
|
|
78
|
+
o1: "o200k_base",
|
|
79
|
+
o3: "o200k_base",
|
|
80
|
+
"o4-mini": "o200k_base",
|
|
81
|
+
# chat
|
|
82
|
+
"gpt-4.1": "o200k_base",
|
|
83
|
+
"chatgpt-4o": "o200k_base",
|
|
84
|
+
"gpt-4o": "o200k_base",
|
|
85
|
+
"gpt-4": "cl100k_base",
|
|
86
|
+
"gpt-3.5-turbo": "cl100k_base",
|
|
87
|
+
"gpt-3.5": "cl100k_base", # Common shorthand
|
|
88
|
+
"gpt-35-turbo": "cl100k_base", # Azure deployment name
|
|
89
|
+
# base
|
|
90
|
+
"davinci-002": "cl100k_base",
|
|
91
|
+
"babbage-002": "cl100k_base",
|
|
92
|
+
# embeddings
|
|
93
|
+
"text-embedding-ada-002": "cl100k_base",
|
|
94
|
+
"text-embedding-3-small": "cl100k_base",
|
|
95
|
+
"text-embedding-3-large": "cl100k_base",
|
|
96
|
+
# DEPRECATED MODELS
|
|
97
|
+
# text (DEPRECATED)
|
|
98
|
+
"text-davinci-003": "p50k_base",
|
|
99
|
+
"text-davinci-002": "p50k_base",
|
|
100
|
+
"text-davinci-001": "r50k_base",
|
|
101
|
+
"text-curie-001": "r50k_base",
|
|
102
|
+
"text-babbage-001": "r50k_base",
|
|
103
|
+
"text-ada-001": "r50k_base",
|
|
104
|
+
davinci: "r50k_base",
|
|
105
|
+
curie: "r50k_base",
|
|
106
|
+
babbage: "r50k_base",
|
|
107
|
+
ada: "r50k_base",
|
|
108
|
+
# code (DEPRECATED)
|
|
109
|
+
"code-davinci-002": "p50k_base",
|
|
110
|
+
"code-davinci-001": "p50k_base",
|
|
111
|
+
"code-cushman-002": "p50k_base",
|
|
112
|
+
"code-cushman-001": "p50k_base",
|
|
113
|
+
"davinci-codex": "p50k_base",
|
|
114
|
+
"cushman-codex": "p50k_base",
|
|
115
|
+
# edit (DEPRECATED)
|
|
116
|
+
"text-davinci-edit-001": "p50k_edit",
|
|
117
|
+
"code-davinci-edit-001": "p50k_edit",
|
|
118
|
+
# old embeddings (DEPRECATED)
|
|
119
|
+
"text-similarity-davinci-001": "r50k_base",
|
|
120
|
+
"text-similarity-curie-001": "r50k_base",
|
|
121
|
+
"text-similarity-babbage-001": "r50k_base",
|
|
122
|
+
"text-similarity-ada-001": "r50k_base",
|
|
123
|
+
"text-search-davinci-doc-001": "r50k_base",
|
|
124
|
+
"text-search-curie-doc-001": "r50k_base",
|
|
125
|
+
"text-search-babbage-doc-001": "r50k_base",
|
|
126
|
+
"text-search-ada-doc-001": "r50k_base",
|
|
127
|
+
"code-search-babbage-code-001": "r50k_base",
|
|
128
|
+
"code-search-ada-code-001": "r50k_base",
|
|
129
|
+
# open source
|
|
130
|
+
gpt2: "gpt2"
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
MODEL_PREFIX_TO_ENCODING = {
|
|
134
|
+
# reasoning
|
|
135
|
+
"o1-": "o200k_base",
|
|
136
|
+
"o3-": "o200k_base",
|
|
137
|
+
"o4-": "o200k_base",
|
|
138
|
+
# chat
|
|
139
|
+
"gpt-5-": "o200k_base",
|
|
140
|
+
"gpt-4.5-": "o200k_base",
|
|
141
|
+
"gpt-4.1-": "o200k_base",
|
|
142
|
+
"chatgpt-4o-": "o200k_base",
|
|
143
|
+
"gpt-4o-": "o200k_base", # e.g., gpt-4o-2024-05-13, etc.
|
|
144
|
+
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
|
|
145
|
+
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
|
|
146
|
+
"gpt-35-turbo-": "cl100k_base", # Azure deployment name
|
|
147
|
+
"gpt-oss-": "o200k_harmony",
|
|
148
|
+
# fine-tuned
|
|
149
|
+
"ft:gpt-4o": "cl100k_base",
|
|
150
|
+
"ft:gpt-4": "cl100k_base",
|
|
151
|
+
"ft:gpt-3.5-turbo": "cl100k_base",
|
|
152
|
+
"ft:davinci-002": "cl100k_base",
|
|
153
|
+
"ft:babbage-002": "cl100k_base"
|
|
154
|
+
}
|
|
155
|
+
end
|
|
156
|
+
end
|
metadata
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: tiktoken_ruby
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.15
|
|
5
|
+
platform: aarch64-linux-musl
|
|
6
|
+
authors:
|
|
7
|
+
- IAPark
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: exe
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2026-01-06 00:00:00.000000000 Z
|
|
12
|
+
dependencies: []
|
|
13
|
+
description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
|
|
14
|
+
used by OpenAI. It can be used to count the number of tokens in text before sending
|
|
15
|
+
it to OpenAI APIs.
|
|
16
|
+
email:
|
|
17
|
+
- isaac.a.park@gmail.com
|
|
18
|
+
executables: []
|
|
19
|
+
extensions: []
|
|
20
|
+
extra_rdoc_files: []
|
|
21
|
+
files:
|
|
22
|
+
- ".rspec"
|
|
23
|
+
- ".standard.yml"
|
|
24
|
+
- ".vscode/settings.json"
|
|
25
|
+
- CHANGELOG.md
|
|
26
|
+
- Gemfile
|
|
27
|
+
- Gemfile.lock
|
|
28
|
+
- LICENSE.txt
|
|
29
|
+
- README.md
|
|
30
|
+
- Rakefile
|
|
31
|
+
- doctest_helper.rb
|
|
32
|
+
- lib/tiktoken_ruby.rb
|
|
33
|
+
- lib/tiktoken_ruby/3.2/tiktoken_ruby.so
|
|
34
|
+
- lib/tiktoken_ruby/3.3/tiktoken_ruby.so
|
|
35
|
+
- lib/tiktoken_ruby/3.4/tiktoken_ruby.so
|
|
36
|
+
- lib/tiktoken_ruby/4.0/tiktoken_ruby.so
|
|
37
|
+
- lib/tiktoken_ruby/encoding.rb
|
|
38
|
+
- lib/tiktoken_ruby/version.rb
|
|
39
|
+
- sig/tiktoken_ruby.rbs
|
|
40
|
+
homepage: https://github.com/IAPark/tiktoken_ruby
|
|
41
|
+
licenses:
|
|
42
|
+
- MIT
|
|
43
|
+
metadata:
|
|
44
|
+
homepage_uri: https://github.com/IAPark/tiktoken_ruby
|
|
45
|
+
source_code_uri: https://github.com/IAPark/tiktoken_ruby
|
|
46
|
+
documentation_uri: https://rubydoc.info/github/IAPark/tiktoken_ruby/main
|
|
47
|
+
post_install_message:
|
|
48
|
+
rdoc_options: []
|
|
49
|
+
require_paths:
|
|
50
|
+
- lib
|
|
51
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
52
|
+
requirements:
|
|
53
|
+
- - ">="
|
|
54
|
+
- !ruby/object:Gem::Version
|
|
55
|
+
version: '3.2'
|
|
56
|
+
- - "<"
|
|
57
|
+
- !ruby/object:Gem::Version
|
|
58
|
+
version: 4.1.dev
|
|
59
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
60
|
+
requirements:
|
|
61
|
+
- - ">="
|
|
62
|
+
- !ruby/object:Gem::Version
|
|
63
|
+
version: 3.4.0
|
|
64
|
+
- - ">="
|
|
65
|
+
- !ruby/object:Gem::Version
|
|
66
|
+
version: 3.3.22
|
|
67
|
+
requirements: []
|
|
68
|
+
rubygems_version: 3.5.23
|
|
69
|
+
signing_key:
|
|
70
|
+
specification_version: 4
|
|
71
|
+
summary: Ruby wrapper for Tiktoken
|
|
72
|
+
test_files: []
|