rz_tiktoken_ruby 0.0.6-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: b223bffcd5dedd1103f76f70974bb824dc9a9c0e2b93f8c485b54ddeaf59f118
4
+ data.tar.gz: 265b4a041ae51b8c0e605997c839738da55d8db5219f55259c020bfc183268f0
5
+ SHA512:
6
+ metadata.gz: a528494705ef899900cf0cbec3360edcff82264b9683bb35126cff8434692facf91604ee0591c4632506c33094adf35b5526686260890dedf1227ce38ebd77af
7
+ data.tar.gz: d8ea43abb212931afbf84773e82dd3c32f89263da9f0382772e35a203c7eb1528b5e12e7b3f400131e0863fa29f4adf29d5204f03203237f8155756a25614a30
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.standard.yml ADDED
@@ -0,0 +1,3 @@
1
+ # For available configuration options, see:
2
+ # https://github.com/testdouble/standard
3
+ ruby_version: 2.6
data/Gemfile ADDED
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ # Specify your gem's dependencies in tiktoken_ruby.gemspec
6
+ gemspec
7
+
8
+ gem "rake", "~> 13.0"
9
+
10
+ gem "rake-compiler"
11
+
12
+ gem "rspec", "~> 3.0"
13
+
14
+ gem "standard", "~> 1.3"
15
+
16
+ gem "yard-doctest", "~> 0.1.17"
data/Gemfile.lock ADDED
@@ -0,0 +1,93 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ rz_tiktoken_ruby (0.0.6)
5
+ rb_sys (~> 0.9.81)
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ ast (2.4.2)
11
+ base64 (0.1.1)
12
+ diff-lcs (1.5.0)
13
+ json (2.6.3)
14
+ language_server-protocol (3.17.0.3)
15
+ lint_roller (1.1.0)
16
+ minitest (5.19.0)
17
+ parallel (1.23.0)
18
+ parser (3.2.2.3)
19
+ ast (~> 2.4.1)
20
+ racc
21
+ racc (1.7.1)
22
+ rainbow (3.1.1)
23
+ rake (13.0.6)
24
+ rake-compiler (1.2.5)
25
+ rake
26
+ rb_sys (0.9.81)
27
+ regexp_parser (2.8.1)
28
+ rexml (3.2.6)
29
+ rspec (3.12.0)
30
+ rspec-core (~> 3.12.0)
31
+ rspec-expectations (~> 3.12.0)
32
+ rspec-mocks (~> 3.12.0)
33
+ rspec-core (3.12.2)
34
+ rspec-support (~> 3.12.0)
35
+ rspec-expectations (3.12.3)
36
+ diff-lcs (>= 1.2.0, < 2.0)
37
+ rspec-support (~> 3.12.0)
38
+ rspec-mocks (3.12.6)
39
+ diff-lcs (>= 1.2.0, < 2.0)
40
+ rspec-support (~> 3.12.0)
41
+ rspec-support (3.12.1)
42
+ rubocop (1.56.1)
43
+ base64 (~> 0.1.1)
44
+ json (~> 2.3)
45
+ language_server-protocol (>= 3.17.0)
46
+ parallel (~> 1.10)
47
+ parser (>= 3.2.2.3)
48
+ rainbow (>= 2.2.2, < 4.0)
49
+ regexp_parser (>= 1.8, < 3.0)
50
+ rexml (>= 3.2.5, < 4.0)
51
+ rubocop-ast (>= 1.28.1, < 2.0)
52
+ ruby-progressbar (~> 1.7)
53
+ unicode-display_width (>= 2.4.0, < 3.0)
54
+ rubocop-ast (1.29.0)
55
+ parser (>= 3.2.1.0)
56
+ rubocop-performance (1.19.0)
57
+ rubocop (>= 1.7.0, < 2.0)
58
+ rubocop-ast (>= 0.4.0)
59
+ ruby-progressbar (1.13.0)
60
+ standard (1.31.0)
61
+ language_server-protocol (~> 3.17.0.2)
62
+ lint_roller (~> 1.0)
63
+ rubocop (~> 1.56.0)
64
+ standard-custom (~> 1.0.0)
65
+ standard-performance (~> 1.2)
66
+ standard-custom (1.0.2)
67
+ lint_roller (~> 1.0)
68
+ rubocop (~> 1.50)
69
+ standard-performance (1.2.0)
70
+ lint_roller (~> 1.1)
71
+ rubocop-performance (~> 1.19.0)
72
+ unicode-display_width (2.4.2)
73
+ yard (0.9.34)
74
+ yard-doctest (0.1.17)
75
+ minitest
76
+ yard
77
+
78
+ PLATFORMS
79
+ arm64-darwin-22
80
+ ruby
81
+ x86_64-darwin-22
82
+ x86_64-linux
83
+
84
+ DEPENDENCIES
85
+ rake (~> 13.0)
86
+ rake-compiler
87
+ rspec (~> 3.0)
88
+ rz_tiktoken_ruby!
89
+ standard (~> 1.3)
90
+ yard-doctest (~> 0.1.17)
91
+
92
+ BUNDLED WITH
93
+ 2.4.6
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2023 IAPark
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,59 @@
1
+ [![Gem Version](https://badge.fury.io/rb/tiktoken_ruby.svg)](https://badge.fury.io/rb/tiktoken_ruby)
2
+ # tiktoken_ruby
3
+
4
+ [Tiktoken](https://github.com/openai/tiktoken) is a BPE tokenizer from OpenAI used with their GPT models.
5
+ This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
6
+
7
+ ## Installation
8
+
9
+ Install the gem and add to the application's Gemfile by executing:
10
+
11
+ $ bundle add tiktoken_ruby
12
+
13
+ If bundler is not being used to manage dependencies, install the gem by executing:
14
+
15
+ $ gem install tiktoken_ruby
16
+
17
+ ## Usage
18
+ Usage should be very similar to the Python library. Here's a simple example:
19
+
20
+ Encode and decode text
21
+ ```ruby
22
+ require 'tiktoken_ruby'
23
+
24
+ enc = Tiktoken.get_encoding("cl100k_base")
25
+ enc.decode(enc.encode("hello world")) #=> "hello world"
26
+ ```
27
+
28
+ Encoders can also be retrieved by model name
29
+ ```ruby
30
+ require 'tiktoken_ruby'
31
+
32
+ enc = Tiktoken.encoding_for_model("gpt-4")
33
+ enc.encode("hello world").length #=> 2
34
+ ```
35
+
36
+ ## Development
37
+
38
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
39
+
40
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
41
+
42
+ ## Contributing
43
+
44
+ Bug reports and pull requests are welcome on GitHub at https://github.com/iapark/tiktoken_ruby.
45
+
46
+ To get started with development:
47
+
48
+ ```sh
49
+ git clone https://github.com/IAPark/tiktoken_ruby.git
50
+ cd tiktoken_ruby
51
+ bundle install
52
+ bundle exec rake compile
53
+ bundle exec rake spec
54
+ ```
55
+
56
+
57
+ ## License
58
+
59
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+ require "standard/rake"
6
+ require "rake/extensiontask"
7
+ require "rb_sys/extensiontask"
8
+
9
+ GEMSPEC = Gem::Specification.load("tiktoken_ruby.gemspec") # loaded once and handed to the extension task below
10
+
11
+ RbSys::ExtensionTask.new("tiktoken_ruby", GEMSPEC) do |ext|
12
+ ext.lib_dir = "lib/tiktoken_ruby" # lib_dir for the extension task; lib/tiktoken_ruby.rb requires the extension from this directory
13
+ end
14
+
15
+ RSpec::Core::RakeTask.new(:spec)
16
+
17
+ task :native, [:platform] do |_t, platform:| # takes one task argument, :platform
18
+ sh "bundle", "exec", "rb-sys-dock", "--platform", platform, "--build" # shells out to rb-sys-dock for the given platform
19
+ end
20
+
21
+ task build: :compile # `build` depends on `compile`
22
+
23
+ task default: %i[compile spec standard] # default task runs compile, spec and standard as prerequisites
data/doctest_helper.rb ADDED
@@ -0,0 +1 @@
1
+ require "lib/tiktoken_ruby"
@@ -0,0 +1,52 @@
1
+ # frozen_string_literal: true
2
+
3
+ class Tiktoken::Encoding
4
+ attr_reader :name
5
+
6
+ # Builds a brand-new Tiktoken::Encoding for the requested encoding; nothing is cached here (see for_name_cached).
7
+ # @param encoding [Symbol, String] The encoding name; it is converted with to_sym and invoked as a method on Tiktoken::BpeFactory via send, so pass only known encoding names (NOTE(review): no validation happens in this method)
8
+ # @return [Tiktoken::Encoding] The newly built encoding instance, whose name is the symbolized encoding
9
+ def self.for_name(encoding)
10
+ Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
11
+ end
12
+
13
+ # Returns the Tiktoken::Encoding for the requested encoding, memoized per symbolized name.
14
+ # The first request for a name builds the instance through for_name; later requests return the stored instance.
15
+ # @param encoding [Symbol, String] The name of the encoding to load (converted with to_sym for the cache key)
16
+ # @return [Tiktoken::Encoding] The cached encoding instance
17
+ def self.for_name_cached(encoding)
18
+ @encodings ||= {}
19
+ @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
20
+ end
21
+
22
+ # Encodes the text as a list of integer tokens by delegating to encode_ordinary on the wrapped BPE object.
23
+ # NOTE(review): the original note said special non-text tokens are encoded "unescaped"; upstream tiktoken documents encode_ordinary as treating special tokens as plain text - confirm which holds here.
24
+ # @param text [String] The text to encode
25
+ # @return [Array<Integer>] The encoded tokens
26
+ def encode_ordinary(text)
27
+ @ext_base_bpe.encode_ordinary(text)
28
+ end
29
+
30
+ # Encodes the text as a list of integer tokens. Special non-text tokens are treated as ordinary text
31
+ # unless they appear in the allowed_special array, i.e. by default the text behaves as if it were escaped.
32
+ # @param text [String] The text to encode
33
+ # @param allowed_special [Array<String>] Special tokens that may be encoded as special tokens (defaults to none)
34
+ # @return [Array<Integer>] The encoded tokens
35
+ def encode(text, allowed_special: [])
36
+ @ext_base_bpe.encode(text, allowed_special)
37
+ end
38
+
39
+ # Decodes the tokens back into text by delegating to decode on the wrapped BPE object.
40
+ # @param tokens [Array<Integer>] The tokens to decode
41
+ # @return [String] The decoded text
42
+ def decode(tokens)
43
+ @ext_base_bpe.decode(tokens)
44
+ end
45
+
46
+ private
47
+
48
+ def initialize(ext_base_bpe, name)
49
+ @ext_base_bpe = ext_base_bpe
50
+ @name = name
51
+ end
52
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Tiktoken
4
+ VERSION = "0.0.6" # gem version string; the README says to update it in version.rb before running `bundle exec rake release`
5
+ end
@@ -0,0 +1,114 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "tiktoken_ruby/version"
4
+ require_relative "tiktoken_ruby/encoding"
5
+
6
+ begin
7
+ RUBY_VERSION =~ /(\d+\.\d+)/
8
+ require_relative "tiktoken_ruby/#{$1}/tiktoken_ruby"
9
+ rescue LoadError
10
+ require_relative "tiktoken_ruby/tiktoken_ruby"
11
+ end
12
+
13
+ module Tiktoken
14
+ class << self
15
+ # Returns an encoding by name. If the encoding has not been loaded yet it is loaded; otherwise
16
+ # the previously loaded instance is reused (via Tiktoken::Encoding.for_name_cached).
17
+ # @param name [Symbol|String] The name of the encoding to load
18
+ # @return [Tiktoken::Encoding, nil] The encoding instance, or nil when the name is not in SUPPORTED_ENCODINGS
19
+ # @example Encode and decode text
20
+ # enc = Tiktoken.get_encoding("cl100k_base")
21
+ # enc.decode(enc.encode("hello world")) #=> "hello world"
22
+ def get_encoding(name)
23
+ name = name.to_sym
24
+ return nil unless SUPPORTED_ENCODINGS.include?(name)
25
+
26
+ Tiktoken::Encoding.for_name_cached(name)
27
+ end
28
+
29
+ # Gets the encoding for an OpenAI model. A name that starts with "<prefix>-" for a prefix in PREFIX_MODELS is resolved as that prefix.
30
+ # @param model_name [Symbol|String] The name of the model to get the encoding for
31
+ # @return [Tiktoken::Encoding, nil] The encoding instance, or nil when the model is not in MODEL_TO_ENCODING_NAME
32
+ # @example Count tokens for text
33
+ # enc = Tiktoken.encoding_for_model("gpt-4")
34
+ # enc.encode("hello world").length #=> 2
35
+ def encoding_for_model(model_name)
36
+ PREFIX_MODELS.each do |prefix|
37
+ if model_name.to_s.start_with?("#{prefix}-")
38
+ model_name = prefix
39
+ break
40
+ end
41
+ end
42
+
43
+ encoding_name = MODEL_TO_ENCODING_NAME[model_name.to_sym]
44
+ return nil unless encoding_name
45
+
46
+ get_encoding(encoding_name)
47
+ end
48
+
49
+ # Lists all the encodings that are supported
50
+ # @return [Array<Symbol>] The SUPPORTED_ENCODINGS array itself (not a copy), so callers should not mutate it
51
+ def list_encoding_names
52
+ SUPPORTED_ENCODINGS
53
+ end
54
+
55
+ # Lists all the models that are supported
56
+ # @return [Array<Symbol>] The keys of MODEL_TO_ENCODING_NAME
57
+ def list_model_names
58
+ MODEL_TO_ENCODING_NAME.keys
59
+ end
60
+
61
+ private
62
+
63
+ SUPPORTED_ENCODINGS = [
64
+ :r50k_base,
65
+ :p50k_base,
66
+ :p50k_edit,
67
+ :cl100k_base
68
+ ]
69
+
70
+ # Taken from the Python library: https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
71
+ # (also MIT licensed, by OpenAI). Keys are model names as symbols; values are encoding names as strings.
72
+ MODEL_TO_ENCODING_NAME = {
73
+ "gpt-4": "cl100k_base",
74
+ "gpt-3.5-turbo": "cl100k_base",
75
+ # text
76
+ "text-davinci-003": "p50k_base",
77
+ "text-davinci-002": "p50k_base",
78
+ "text-davinci-001": "r50k_base",
79
+ "text-curie-001": "r50k_base",
80
+ "text-babbage-001": "r50k_base",
81
+ "text-ada-001": "r50k_base",
82
+ davinci: "r50k_base",
83
+ curie: "r50k_base",
84
+ babbage: "r50k_base",
85
+ ada: "r50k_base",
86
+ # code
87
+ "code-davinci-002": "p50k_base",
88
+ "code-davinci-001": "p50k_base",
89
+ "code-cushman-002": "p50k_base",
90
+ "code-cushman-001": "p50k_base",
91
+ "davinci-codex": "p50k_base",
92
+ "cushman-codex": "p50k_base",
93
+ # edit
94
+ "text-davinci-edit-001": "p50k_edit",
95
+ "code-davinci-edit-001": "p50k_edit",
96
+ # embeddings
97
+ "text-embedding-ada-002": "cl100k_base",
98
+ # old embeddings
99
+ "text-similarity-davinci-001": "r50k_base",
100
+ "text-similarity-curie-001": "r50k_base",
101
+ "text-similarity-babbage-001": "r50k_base",
102
+ "text-similarity-ada-001": "r50k_base",
103
+ "text-search-davinci-doc-001": "r50k_base",
104
+ "text-search-curie-doc-001": "r50k_base",
105
+ "text-search-babbage-doc-001": "r50k_base",
106
+ "text-search-ada-doc-001": "r50k_base",
107
+ "code-search-babbage-code-001": "r50k_base",
108
+ "code-search-ada-code-001": "r50k_base"
109
+ }
110
+
111
+ # Models with versioned variants that share one encoding: a name starting with "<entry>-" is looked up as <entry>.
112
+ PREFIX_MODELS = ["gpt-4", "gpt-3.5-turbo"]
113
+ end
114
+ end
@@ -0,0 +1,4 @@
1
+ module TiktokenRuby
2
+ VERSION: String
3
+ # NOTE(review): the Ruby sources in this package define Tiktoken::VERSION, not TiktokenRuby::VERSION - confirm whether this module should be named Tiktoken. See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rz_tiktoken_ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.6
5
+ platform: x86_64-linux
6
+ authors:
7
+ - IAPark
8
+ - judy
9
+ autorequire:
10
+ bindir: exe
11
+ cert_chain: []
12
+ date: 2023-08-24 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: |
15
+ An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and used by OpenAI. It
16
+ can be used to count the number of tokens in text before sending it to OpenAI APIs. This
17
+ is a fork of tiktoken_ruby by IAPark, which has been cross-compiled for multiple platforms.
18
+ This way compilation with Rust extensions doesn't need to happen wherever you are deploying it.
19
+ email:
20
+ - isaac.a.park@gmail.com
21
+ - clinton@judy.io
22
+ executables: []
23
+ extensions: []
24
+ extra_rdoc_files: []
25
+ files:
26
+ - ".rspec"
27
+ - ".standard.yml"
28
+ - Gemfile
29
+ - Gemfile.lock
30
+ - LICENSE.txt
31
+ - README.md
32
+ - Rakefile
33
+ - doctest_helper.rb
34
+ - lib/tiktoken_ruby.rb
35
+ - lib/tiktoken_ruby/3.0/tiktoken_ruby.so
36
+ - lib/tiktoken_ruby/3.1/tiktoken_ruby.so
37
+ - lib/tiktoken_ruby/3.2/tiktoken_ruby.so
38
+ - lib/tiktoken_ruby/encoding.rb
39
+ - lib/tiktoken_ruby/version.rb
40
+ - sig/tiktoken_ruby.rbs
41
+ homepage: https://github.com/retailzipline/tiktoken_ruby
42
+ licenses:
43
+ - MIT
44
+ metadata:
45
+ homepage_uri: https://github.com/retailzipline/tiktoken_ruby
46
+ source_code_uri: https://github.com/retailzipline/tiktoken_ruby
47
+ documentation_uri: https://rubydoc.info/github/IAPark/tiktoken_ruby/main
48
+ allowed_push_host: https://rubygems.org
49
+ post_install_message:
50
+ rdoc_options: []
51
+ require_paths:
52
+ - lib
53
+ required_ruby_version: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: '3.0'
58
+ - - "<"
59
+ - !ruby/object:Gem::Version
60
+ version: 3.3.dev
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: 3.1.0
66
+ requirements: []
67
+ rubygems_version: 3.4.4
68
+ signing_key:
69
+ specification_version: 4
70
+ summary: Ruby wrapper for Tiktoken
71
+ test_files: []