ru_token 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md ADDED
@@ -0,0 +1,50 @@
1
+ # RuToken 🚀
2
+ RuToken provides a high-performance, native Ruby interface for counting tokens using the powerful tiktoken-rs library. It leverages the speed of Rust to offer a fast and efficient way to calculate token counts for various OpenAI models.
3
+
4
+ The gem ships with pre-compiled native extensions (except Windows), so your end-users don't need a Rust toolchain installed.
5
+
6
+ ## Features
7
+ High Performance ⚡️: Uses a native Rust implementation for blazing-fast tokenization.
8
+
9
+ Simple API: A clean and straightforward interface for counting tokens.
10
+
11
+ Extensive Model Support: Includes tokenizers for all modern and legacy OpenAI models, recognizing dozens of model aliases automatically.
12
+
13
+ Pre-compiled: Ships with binaries for major platforms (Linux, macOS, Windows) and Ruby versions, removing the need for local compilation in production.
14
+
15
+ ## Count tokens for a specific model (the model keyword is required)
16
+ `count = RuToken::Tokenizer.count(text, model: "gpt-4o")`
17
+ #### => 13
18
+
19
+ ## The gem recognizes many aliases, including older models
20
+ `count = RuToken::Tokenizer.count(text, model: "text-davinci-003")`
21
+ #### => 13
22
+ If you provide an unsupported model name, the gem will raise an ArgumentError.
23
+
24
+ ## Supported Models
25
+ The gem automatically maps dozens of model names and prefixes to the correct underlying tokenizer. You don't need to know the tokenizer's base name (e.g., cl100k_base); just use the model name you're working with.
26
+
27
+ o200k_base Models (e.g., GPT-4o)
28
+ cl100k_base Models (e.g., GPT-4, GPT-3.5)
29
+ p50k_base Models (e.g., text-davinci-003)
30
+ r50k_base Models (e.g., GPT-2)
31
+ p50k_edit Models
32
+ ## Development
33
+ After checking out the repo, set up your environment:
34
+
35
+ Bash
36
+
37
+ ## Install Ruby and Rust dependencies
38
+ Ruby >= 2.7 required
39
+
40
+ `bundle install`
41
+
42
+ ## Compile the Rust extension and run tests
43
+ `bundle exec rake`
44
+ To just compile the extension without running tests, use bundle exec rake compile. You can also open an interactive console for experimentation with bin/console.
45
+
46
+ ## Contributing
47
+ Bug reports and pull requests are welcome on GitHub at https://github.com/LoganBresnahan/ru_token. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the code of conduct.
48
+
49
+ ## License
50
+ The gem is available as open source under the terms of the GNU General Public License v3.0.
data/Rakefile ADDED
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/setup"
4
+ require "bundler/gem_tasks"
5
+ require "rake/clean"
6
+ require "rake/extensiontask"
7
+ require "rspec/core/rake_task"
8
+
9
+ spec = Gem::Specification.load("ru_token.gemspec")
10
+
11
+ Rake::ExtensionTask.new("ru_token", spec) do |ext|
12
+ ext.lib_dir = "lib/ru_token"
13
+ end
14
+
15
+ # Define a task to run your specs
16
+ RSpec::Core::RakeTask.new(:spec)
17
+
18
+ # Make the 'spec' task depend on the 'compile' task
19
+ task spec: :compile
20
+
21
+ # Set the default task to run specs
22
+ task default: :spec
@@ -0,0 +1,9 @@
1
+ # This file tells Rust's build system (Cargo) how to cross-compile.
2
+
3
+ # Configuration for the aarch64-linux target
4
+ [target.aarch64-unknown-linux-gnu]
5
+ linker = "aarch64-linux-gnu-gcc"
6
+
7
+ # Configuration for the x86_64-linux target
8
+ [target.x86_64-unknown-linux-gnu]
9
+ linker = "gcc"
@@ -0,0 +1,13 @@
1
+ [package]
2
+ name = "ru_token"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+ authors = ["Logan Bresnahan <loganbbres@gmail.com>"]
6
+ publish = false
7
+
8
+ [lib]
9
+ crate-type = ["cdylib"]
10
+
11
+ [dependencies]
12
+ magnus = { version = "0.6" }
13
+ tiktoken-rs = { version = "0.7" }
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "mkmf"
4
+ require "rb_sys/mkmf"
5
+
6
+ create_rust_makefile("ru_token/ru_token")
@@ -0,0 +1,60 @@
1
+ use magnus::{define_module, exception, function, Error};
2
+ use tiktoken_rs::{cl100k_base, o200k_base, p50k_base, p50k_edit, r50k_base, CoreBPE};
3
+
4
+ // Helper function to get the correct tokenizer based on the model name.
5
+ // This logic is adapted from the tiktoken-rs library.
6
+ fn get_bpe_from_model(model: &str) -> Result<CoreBPE, Error> {
7
+ let tokenizer = match model {
8
+ // --- O200k Base Models ---
9
+ "o200k_base" | "gpt-4.1" | "chatgpt-4o-latest" | "gpt-4o" => o200k_base(),
10
+ // --- Cl100k Base Models ---
11
+ "cl100k_base" | "gpt-4" | "gpt-3.5-turbo" | "gpt-3.5" | "gpt-35-turbo"
12
+ | "davinci-002" | "babbage-002" | "text-embedding-ada-002"
13
+ | "text-embedding-3-small" | "text-embedding-3-large" => cl100k_base(),
14
+ // --- P50k Base Models ---
15
+ "p50k_base" | "text-davinci-003" | "text-davinci-002" | "code-davinci-002"
16
+ | "code-davinci-001" | "code-cushman-002" | "code-cushman-001"
17
+ | "davinci-codex" | "cushman-codex" => p50k_base(),
18
+ // --- R50k Base (GPT-2) Models ---
19
+ "r50k_base" | "gpt2" | "gpt-2" | "text-davinci-001" | "text-curie-001"
20
+ | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada"
21
+ | "text-similarity-davinci-001" | "text-similarity-curie-001"
22
+ | "text-similarity-babbage-001" | "text-similarity-ada-001"
23
+ | "text-search-davinci-doc-001" | "text-search-curie-doc-001"
24
+ | "text-search-babbage-doc-001" | "text-search-ada-doc-001"
25
+ | "code-search-babbage-code-001" | "code-search-ada-code-001" => r50k_base(),
26
+ // --- P50k Edit Models ---
27
+ "p50k_edit" | "text-davinci-edit-001" | "code-davinci-edit-001" => p50k_edit(),
28
+ // --- Fallback for Prefixes ---
29
+ _ => {
30
+ if model.starts_with("o1-") || model.starts_with("o3-") || model.starts_with("o4-")
31
+ || model.starts_with("gpt-4.1-") || model.starts_with("chatgpt-4o-")
32
+ || model.starts_with("gpt-4o-") || model.starts_with("ft:gpt-4o") {
33
+ o200k_base()
34
+ } else if model.starts_with("gpt-4-") || model.starts_with("gpt-3.5-turbo-")
35
+ || model.starts_with("gpt-35-turbo-") || model.starts_with("ft:gpt-4")
36
+ || model.starts_with("ft:gpt-3.5-turbo") || model.starts_with("ft:davinci-002")
37
+ || model.starts_with("ft:babbage-002") {
38
+ cl100k_base()
39
+ } else {
40
+ let err_msg = format!("Model '{}' not supported.", model);
41
+ return Err(Error::new(exception::arg_error(), err_msg));
42
+ }
43
+ }
44
+ };
45
+ tokenizer.map_err(|e| Error::new(exception::runtime_error(), e.to_string()))
46
+ }
47
+
48
+ // This function is exposed to Ruby.
49
+ fn count_tokens(model: String, text: String) -> Result<usize, Error> {
50
+ let bpe = get_bpe_from_model(&model)?;
51
+ let tokens = bpe.encode_with_special_tokens(&text);
52
+ Ok(tokens.len())
53
+ }
54
+
55
+ #[magnus::init]
56
+ fn init() -> Result<(), Error> {
57
+ let module = define_module("RuToken")?;
58
+ module.define_module_function("count_tokens", function!(count_tokens, 2))?;
59
+ Ok(())
60
+ }
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RuToken
4
+ VERSION = "0.1.1"
5
+ end
data/lib/ru_token.rb ADDED
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "ru_token/version"
4
+
5
+ begin
6
+ require "ru_token/ru_token"
7
+ rescue LoadError
8
+ warn "Failed to load ru_token native extension."
9
+ end
10
+
11
+ module RuToken
12
+ class Error < StandardError; end
13
+
14
+ # This is the main public interface for the gem.
15
+ class Tokenizer
16
+ # Counts tokens for a given text using a specified model name.
17
+ # The underlying Rust extension will handle mapping the model name
18
+ # to the correct tokenizer.
19
+ #
20
+ # @param text [String] The text to tokenize.
21
+ # @param model [String] The name of the model (e.g., "gpt-4o", "gpt-3.5-turbo").
22
+ # @return [Integer] The number of tokens.
23
+ def self.count(text, model:)
24
+ # This calls the `count_tokens` function from our Rust code,
25
+ # passing both the model and the text.
26
+ RuToken.count_tokens(model, text.to_s)
27
+ end
28
+ end
29
+ end
data/sig/ru_token.rbs ADDED
@@ -0,0 +1,4 @@
1
+ module RuToken
2
+ VERSION: String
3
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end
metadata ADDED
@@ -0,0 +1,107 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ru_token
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Logan Bresnahan
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2025-07-30 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '2.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '2.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '13.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '13.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: RuToken is a Ruby gem that wraps the tiktoken Rust library, enabling
56
+ fast and efficient tokenization for OpenAI models. It supports multiple models including
57
+ o200k_base, cl100k_base, p50k_base, and r50k_base.
58
+ email:
59
+ - loganbbres@gmail.com
60
+ executables: []
61
+ extensions:
62
+ - ext/ru_token/Cargo.toml
63
+ extra_rdoc_files: []
64
+ files:
65
+ - ".rspec"
66
+ - CHANGELOG.md
67
+ - Cargo.lock
68
+ - Cargo.toml
69
+ - LICENSE
70
+ - README.md
71
+ - Rakefile
72
+ - ext/ru_token/.cargo/config.toml
73
+ - ext/ru_token/Cargo.toml
74
+ - ext/ru_token/extconf.rb
75
+ - ext/ru_token/src/lib.rs
76
+ - lib/ru_token.rb
77
+ - lib/ru_token/version.rb
78
+ - sig/ru_token.rbs
79
+ homepage: https://github.com/LoganBresnahan/ru_token
80
+ licenses:
81
+ - GPL-3.0-or-later
82
+ metadata:
83
+ allowed_push_host: https://rubygems.org
84
+ homepage_uri: https://github.com/LoganBresnahan/ru_token
85
+ source_code_uri: https://github.com/LoganBresnahan/ru_token
86
+ changelog_uri: https://github.com/LoganBresnahan/ru_token/blob/main/CHANGELOG.md
87
+ post_install_message:
88
+ rdoc_options: []
89
+ require_paths:
90
+ - lib
91
+ required_ruby_version: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: 2.7.0
96
+ required_rubygems_version: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - ">="
99
+ - !ruby/object:Gem::Version
100
+ version: 3.0.0
101
+ requirements: []
102
+ rubygems_version: 3.3.26
103
+ signing_key:
104
+ specification_version: 4
105
+ summary: Ruby wrapper for the tiktoken Rust library, providing fast tokenization for
106
+ OpenAI models.
107
+ test_files: []