ru_token 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/CHANGELOG.md +30 -0
- data/Cargo.lock +439 -0
- data/Cargo.toml +7 -0
- data/LICENSE +674 -0
- data/README.md +50 -0
- data/Rakefile +22 -0
- data/ext/ru_token/.cargo/config.toml +9 -0
- data/ext/ru_token/Cargo.toml +13 -0
- data/ext/ru_token/extconf.rb +6 -0
- data/ext/ru_token/src/lib.rs +60 -0
- data/lib/ru_token/version.rb +5 -0
- data/lib/ru_token.rb +29 -0
- data/sig/ru_token.rbs +4 -0
- metadata +107 -0
data/README.md
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# RuToken 🚀
|
2
|
+
RuToken provides a high-performance, native Ruby interface for counting tokens using the powerful tiktoken-rs library. It leverages the speed of Rust to offer a fast and efficient way to calculate token counts for various OpenAI models.
|
3
|
+
|
4
|
+
The gem ships with pre-compiled native extensions (except Windows), so your end-users don't need a Rust toolchain installed.
|
5
|
+
|
6
|
+
Features
|
7
|
+
High Performance ⚡️: Uses a native Rust implementation for blazing-fast tokenization.
|
8
|
+
|
9
|
+
Simple API: A clean and straightforward interface for counting tokens.
|
10
|
+
|
11
|
+
Extensive Model Support: Includes tokenizers for all modern and legacy OpenAI models, recognizing dozens of model aliases automatically.
|
12
|
+
|
13
|
+
Pre-compiled: Ships with binaries for major platforms (Linux, macOS) and multiple Ruby versions, removing the need for local compilation in production; Windows users build the extension from source.
|
14
|
+
|
15
|
+
## Count tokens for a specific model (the model keyword is required)
|
16
|
+
`count = RuToken::Tokenizer.count(text, model: "gpt-4o")`
|
17
|
+
#### => 13
|
18
|
+
|
19
|
+
## The gem recognizes many aliases, including older models
|
20
|
+
`count = RuToken::Tokenizer.count(text, model: "text-davinci-003")`
|
21
|
+
#### => 13
|
22
|
+
If you provide an unsupported model name, the gem will raise an ArgumentError.
|
23
|
+
|
24
|
+
Supported Models
|
25
|
+
The gem automatically maps dozens of model names and prefixes to the correct underlying tokenizer. You don't need to know the tokenizer's base name (e.g., cl100k_base); just use the model name you're working with.
|
26
|
+
|
27
|
+
o200k_base Models (e.g., GPT-4o)
|
28
|
+
cl100k_base Models (e.g., GPT-4, GPT-3.5)
|
29
|
+
p50k_base Models (e.g., text-davinci-003)
|
30
|
+
r50k_base Models (e.g., GPT-2)
|
31
|
+
p50k_edit Models
|
32
|
+
Development
|
33
|
+
After checking out the repo, set up your environment:
|
34
|
+
|
35
|
+
Bash
|
36
|
+
|
37
|
+
## Install Ruby and Rust dependencies
|
38
|
+
Ruby >= 2.7 required
|
39
|
+
|
40
|
+
`bundle install`
|
41
|
+
|
42
|
+
## Compile the Rust extension and run tests
|
43
|
+
bundle exec rake
|
44
|
+
To just compile the extension without running tests, use bundle exec rake compile. You can also open an interactive console for experimentation with bin/console.
|
45
|
+
|
46
|
+
Contributing
|
47
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/LoganBresnahan/ru_token. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the code of conduct.
|
48
|
+
|
49
|
+
License
|
50
|
+
The gem is available as open source under the terms of the GNU General Public License v3.0.
|
data/Rakefile
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# frozen_string_literal: true

# Build/test orchestration for the ru_token gem: compiles the Rust native
# extension via rake-compiler, then runs the RSpec suite.
require "bundler/setup"
require "bundler/gem_tasks"
require "rake/clean"
require "rake/extensiontask"
require "rspec/core/rake_task"

# Load the gemspec so the extension task knows the gem's files/platforms.
spec = Gem::Specification.load("ru_token.gemspec")

# Compile the Rust extension (ext/ru_token) and place the resulting shared
# library under lib/ru_token, where `require "ru_token/ru_token"` expects it.
Rake::ExtensionTask.new("ru_token", spec) do |ext|
  ext.lib_dir = "lib/ru_token"
end

# Define a task to run your specs
RSpec::Core::RakeTask.new(:spec)

# Make the 'spec' task depend on the 'compile' task so tests always run
# against a freshly built extension.
task spec: :compile

# Set the default task to run specs (so a bare `rake` compiles + tests)
task default: :spec
|
@@ -0,0 +1,9 @@
|
|
1
|
+
# This file tells Rust's build system (Cargo) how to cross-compile.
# Used when building pre-compiled gem binaries for Linux targets from a
# non-matching host.

# Configuration for the aarch64-linux target
[target.aarch64-unknown-linux-gnu]
linker = "aarch64-linux-gnu-gcc"

# Configuration for the x86_64-linux target
[target.x86_64-unknown-linux-gnu]
linker = "gcc"
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Crate manifest for the gem's native extension.
[package]
name = "ru_token"
version = "0.1.0"
edition = "2021"
authors = ["Logan Bresnahan <loganbbres@gmail.com>"]
# Built and shipped inside the gem only; never published to crates.io.
publish = false

[lib]
# cdylib: emit a C-compatible shared library that the Ruby VM can dlopen.
crate-type = ["cdylib"]

[dependencies]
# magnus: Ruby <-> Rust bindings (module/function definition, exceptions)
magnus = { version = "0.6" }
# tiktoken-rs: OpenAI BPE tokenizers (o200k_base, cl100k_base, ...)
tiktoken-rs = { version= "0.7" }
|
@@ -0,0 +1,60 @@
|
|
1
|
+
use magnus::{define_module, exception, function, Error};
|
2
|
+
use tiktoken_rs::{cl100k_base, o200k_base, p50k_base, p50k_edit, r50k_base, CoreBPE};
|
3
|
+
|
4
|
+
// Helper function to get the correct tokenizer based on the model name.
// This logic is adapted from the tiktoken-rs library.
//
// Resolution happens in two stages:
//   1. Exact-match on known model names (and the raw encoding names such as
//      "cl100k_base"), mapping each to its tiktoken-rs constructor.
//   2. Prefix-match fallback for dated/fine-tuned variants (e.g.
//      "gpt-4o-2024-05-13", "ft:gpt-3.5-turbo:...").
//
// NOTE: the order of the prefix checks is significant — "ft:gpt-4o" must be
// tested before "ft:gpt-4", otherwise fine-tuned 4o models would be routed
// to cl100k_base instead of o200k_base. Do not reorder.
//
// Returns ArgumentError (to Ruby) for unrecognized models, and RuntimeError
// if the tokenizer itself fails to construct.
fn get_bpe_from_model(model: &str) -> Result<CoreBPE, Error> {
    let tokenizer = match model {
        // --- O200k Base Models ---
        "o200k_base" | "gpt-4.1" | "chatgpt-4o-latest" | "gpt-4o" => o200k_base(),
        // --- Cl100k Base Models ---
        "cl100k_base" | "gpt-4" | "gpt-3.5-turbo" | "gpt-3.5" | "gpt-35-turbo"
        | "davinci-002" | "babbage-002" | "text-embedding-ada-002"
        | "text-embedding-3-small" | "text-embedding-3-large" => cl100k_base(),
        // --- P50k Base Models ---
        "p50k_base" | "text-davinci-003" | "text-davinci-002" | "code-davinci-002"
        | "code-davinci-001" | "code-cushman-002" | "code-cushman-001"
        | "davinci-codex" | "cushman-codex" => p50k_base(),
        // --- R50k Base (GPT-2) Models ---
        "r50k_base" | "gpt2" | "gpt-2" | "text-davinci-001" | "text-curie-001"
        | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada"
        | "text-similarity-davinci-001" | "text-similarity-curie-001"
        | "text-similarity-babbage-001" | "text-similarity-ada-001"
        | "text-search-davinci-doc-001" | "text-search-curie-doc-001"
        | "text-search-babbage-doc-001" | "text-search-ada-doc-001"
        | "code-search-babbage-code-001" | "code-search-ada-code-001" => r50k_base(),
        // --- P50k Edit Models ---
        "p50k_edit" | "text-davinci-edit-001" | "code-davinci-edit-001" => p50k_edit(),
        // --- Fallback for Prefixes ---
        // Dated snapshots ("gpt-4o-...") and fine-tunes ("ft:...") won't
        // exact-match above, so classify them by prefix instead.
        _ => {
            if model.starts_with("o1-") || model.starts_with("o3-") || model.starts_with("o4-")
                || model.starts_with("gpt-4.1-") || model.starts_with("chatgpt-4o-")
                || model.starts_with("gpt-4o-") || model.starts_with("ft:gpt-4o") {
                o200k_base()
            } else if model.starts_with("gpt-4-") || model.starts_with("gpt-3.5-turbo-")
                || model.starts_with("gpt-35-turbo-") || model.starts_with("ft:gpt-4")
                || model.starts_with("ft:gpt-3.5-turbo") || model.starts_with("ft:davinci-002")
                || model.starts_with("ft:babbage-002") {
                cl100k_base()
            } else {
                // Unknown model: surface as a Ruby ArgumentError, as the
                // README documents.
                let err_msg = format!("Model '{}' not supported.", model);
                return Err(Error::new(exception::arg_error(), err_msg));
            }
        }
    };
    // The tiktoken-rs constructors are fallible; translate their error into
    // a Ruby RuntimeError rather than panicking inside the extension.
    tokenizer.map_err(|e| Error::new(exception::runtime_error(), e.to_string()))
}
|
47
|
+
|
48
|
+
// This function is exposed to Ruby.
|
49
|
+
fn count_tokens(model: String, text: String) -> Result<usize, Error> {
|
50
|
+
let bpe = get_bpe_from_model(&model)?;
|
51
|
+
let tokens = bpe.encode_with_special_tokens(&text);
|
52
|
+
Ok(tokens.len())
|
53
|
+
}
|
54
|
+
|
55
|
+
#[magnus::init]
|
56
|
+
fn init() -> Result<(), Error> {
|
57
|
+
let module = define_module("RuToken")?;
|
58
|
+
module.define_module_function("count_tokens", function!(count_tokens, 2))?;
|
59
|
+
Ok(())
|
60
|
+
}
|
data/lib/ru_token.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true

require_relative "ru_token/version"

begin
  require "ru_token/ru_token"
rescue LoadError => e
  # Include the loader's own message: a bare "failed to load" warning hides
  # whether the .so is missing, built for the wrong platform, etc.
  warn "Failed to load ru_token native extension. (#{e.message})"
end

module RuToken
  # Base error class for the gem.
  class Error < StandardError; end

  # This is the main public interface for the gem.
  class Tokenizer
    # Counts tokens for a given text using a specified model name.
    # The underlying Rust extension will handle mapping the model name
    # to the correct tokenizer.
    #
    # @param text [String] The text to tokenize.
    # @param model [String] The name of the model (e.g., "gpt-4o", "gpt-3.5-turbo").
    # @return [Integer] The number of tokens.
    # @raise [ArgumentError] if the model name is not supported.
    def self.count(text, model:)
      # This calls the `count_tokens` function from our Rust code,
      # passing both the model and the text (coerced to a String).
      RuToken.count_tokens(model, text.to_s)
    end
  end
end
|
data/sig/ru_token.rbs
ADDED
metadata
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ru_token
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Logan Bresnahan
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2025-07-30 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '13.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '13.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake-compiler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
description: RuToken is a Ruby gem that wraps the tiktoken Rust library, enabling
|
56
|
+
fast and efficient tokenization for OpenAI models. It supports multiple models including
|
57
|
+
o200k_base, cl100k_base, p50k_base, and r50k_base.
|
58
|
+
email:
|
59
|
+
- loganbbres@gmail.com
|
60
|
+
executables: []
|
61
|
+
extensions:
|
62
|
+
- ext/ru_token/Cargo.toml
|
63
|
+
extra_rdoc_files: []
|
64
|
+
files:
|
65
|
+
- ".rspec"
|
66
|
+
- CHANGELOG.md
|
67
|
+
- Cargo.lock
|
68
|
+
- Cargo.toml
|
69
|
+
- LICENSE
|
70
|
+
- README.md
|
71
|
+
- Rakefile
|
72
|
+
- ext/ru_token/.cargo/config.toml
|
73
|
+
- ext/ru_token/Cargo.toml
|
74
|
+
- ext/ru_token/extconf.rb
|
75
|
+
- ext/ru_token/src/lib.rs
|
76
|
+
- lib/ru_token.rb
|
77
|
+
- lib/ru_token/version.rb
|
78
|
+
- sig/ru_token.rbs
|
79
|
+
homepage: https://github.com/LoganBresnahan/ru_token
|
80
|
+
licenses:
|
81
|
+
- GPL-3.0-or-later
|
82
|
+
metadata:
|
83
|
+
allowed_push_host: https://rubygems.org
|
84
|
+
homepage_uri: https://github.com/LoganBresnahan/ru_token
|
85
|
+
source_code_uri: https://github.com/LoganBresnahan/ru_token
|
86
|
+
changelog_uri: https://github.com/LoganBresnahan/ru_token/blob/main/CHANGELOG.md
|
87
|
+
post_install_message:
|
88
|
+
rdoc_options: []
|
89
|
+
require_paths:
|
90
|
+
- lib
|
91
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: 2.7.0
|
96
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - ">="
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: 3.0.0
|
101
|
+
requirements: []
|
102
|
+
rubygems_version: 3.3.26
|
103
|
+
signing_key:
|
104
|
+
specification_version: 4
|
105
|
+
summary: Ruby wrapper for the tiktoken Rust library, providing fast tokenization for
|
106
|
+
OpenAI models.
|
107
|
+
test_files: []
|