tiktoken_ruby 0.0.2 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 417d96c1d3bb9bac547fb3cc632e061894e13e7d7673a74e08ddc33d33bc3cbb
4
- data.tar.gz: b27d96b688bc4fa5f31cdecfaa629306e262bd34960a25bda93e7b8364e69c57
3
+ metadata.gz: 1c461629e4b1174e825e05f4644e7bcdaac90849047997c6724358435c70f0bb
4
+ data.tar.gz: 244217e1b298a998c468723d2082bcf10bb46e6a80168d808a56ee4b618bbf6f
5
5
  SHA512:
6
- metadata.gz: 03cdce8758407d412fe29e580ed5caed455d8153923c982b60ed0440f3537f73034c121fa758e01b86354368443ff2e1f2ed4b255833ccf385d99ff79f337058
7
- data.tar.gz: 55b2e0db635609d0f756abef80e761b828eedc345b442b620c086f3e07072895b094a115993d4083633f34511e5a74ff38a2f0187a6308b28abd6f9d5483c016
6
+ metadata.gz: e2920fba51441a33440435c310e2389723ed5e63dd150e0303743e09ee2ef5f68499e14f9df6254daa7a355568fbdf54e91a9ef3eebba30182f757e7f190ba11
7
+ data.tar.gz: b0da8ae135800a6405e546a03f1c17319978096716872a3d4b2fbe7fe812f2061f73d02eb8540f866d41a009d21c075273e096c3073be6ae1b5562f799f806c1
data/Gemfile CHANGED
@@ -13,4 +13,5 @@ gem "rb_sys"
13
13
  gem "rspec", "~> 3.0"
14
14
 
15
15
  gem "standard", "~> 1.3"
16
- gem 'pry', '~> 0.14.2'
16
+
17
+ gem "yard-doctest", "~> 0.1.17"
data/Gemfile.lock CHANGED
@@ -1,23 +1,19 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tiktoken_ruby (0.0.2)
4
+ tiktoken_ruby (0.0.4)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
9
  ast (2.4.2)
10
- coderay (1.1.3)
11
10
  diff-lcs (1.5.0)
12
11
  json (2.6.3)
13
12
  language_server-protocol (3.17.0.3)
14
- method_source (1.0.0)
13
+ minitest (5.18.0)
15
14
  parallel (1.22.1)
16
15
  parser (3.2.1.1)
17
16
  ast (~> 2.4.1)
18
- pry (0.14.2)
19
- coderay (~> 1.1)
20
- method_source (~> 1.0)
21
17
  rainbow (3.1.1)
22
18
  rake (13.0.6)
23
19
  rake-compiler (1.2.1)
@@ -59,18 +55,26 @@ GEM
59
55
  rubocop (= 1.48.1)
60
56
  rubocop-performance (= 1.16.0)
61
57
  unicode-display_width (2.4.2)
58
+ webrick (1.7.0)
59
+ yard (0.9.28)
60
+ webrick (~> 1.7.0)
61
+ yard-doctest (0.1.17)
62
+ minitest
63
+ yard
62
64
 
63
65
  PLATFORMS
64
66
  arm64-darwin-22
67
+ x86_64-darwin-22
68
+ x86_64-linux
65
69
 
66
70
  DEPENDENCIES
67
- pry (~> 0.14.2)
68
71
  rake (~> 13.0)
69
72
  rake-compiler
70
73
  rb_sys
71
74
  rspec (~> 3.0)
72
75
  standard (~> 1.3)
73
76
  tiktoken_ruby!
77
+ yard-doctest (~> 0.1.17)
74
78
 
75
79
  BUNDLED WITH
76
80
  2.4.6
data/README.md CHANGED
@@ -15,11 +15,22 @@ If bundler is not being used to manage dependencies, install the gem by executin
15
15
  $ gem install tiktoken_ruby
16
16
 
17
17
  ## Usage
18
+ Usage should be very similar to the Python library. Here's a simple example:
18
19
 
20
+ Encode and decode text
19
21
  ```ruby
20
- encoding = Tiktoken::Encoding.r50k_base
21
- tokens = encoding.encode("Hello world!")
22
- puts encoding.decode(tokens)
22
+ require 'tiktoken_ruby'
23
+
24
+ enc = Tiktoken.get_encoding("cl100k_base")
25
+ enc.decode(enc.encode("hello world")) #=> "hello world"
26
+ ```
27
+
28
+ Encoders can also be retrieved by model name
29
+ ```ruby
30
+ require 'tiktoken_ruby'
31
+
32
+ enc = Tiktoken.encoding_for_model("gpt-4")
33
+ enc.encode("hello world").length #=> 2
23
34
  ```
24
35
 
25
36
  ## Development
@@ -32,6 +43,17 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
32
43
 
33
44
  Bug reports and pull requests are welcome on GitHub at https://github.com/iapark/tiktoken_ruby.
34
45
 
46
+ To get started with development:
47
+
48
+ ```sh
49
+ git clone https://github.com/IAPark/tiktoken_ruby.git
50
+ cd tiktoken_ruby
51
+ bundle install
52
+ bundle exec rake compile
53
+ bundle exec rake spec
54
+ ```
55
+
56
+
35
57
  ## License
36
58
 
37
59
  The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile CHANGED
@@ -14,7 +14,6 @@ end
14
14
 
15
15
  RSpec::Core::RakeTask.new(:spec)
16
16
 
17
-
18
17
  task :native, [:platform] do |_t, platform:|
19
18
  sh "bundle", "exec", "rb-sys-dock", "--platform", platform, "--build"
20
19
  end
data/doctest_helper.rb ADDED
@@ -0,0 +1 @@
1
+ require "lib/tiktoken_ruby"
@@ -35,7 +35,7 @@ fn init() -> Result<(), Error> {
35
35
  let factory_module = module.define_module("BpeFactory")?;
36
36
  factory_module.define_singleton_method("r50k_base", function!(r50k_base, 0))?;
37
37
  factory_module.define_singleton_method("p50k_base", function!(p50k_base, 0))?;
38
- factory_module.define_singleton_method("p50k_base", function!(p50k_edit, 0))?;
38
+ factory_module.define_singleton_method("p50k_edit", function!(p50k_edit, 0))?;
39
39
  factory_module.define_singleton_method("cl100k_base", function!(cl100k_base, 0))?;
40
40
 
41
41
 
@@ -1,23 +1,52 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- class Tiktoken::Encoding
4
- def self.method_missing(method)
5
- Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(method))
6
- end
7
-
8
- def initialize(ext_base_bpe)
9
- @ext_base_bpe = ext_base_bpe
10
- end
11
-
12
- def encode_ordinary(text)
13
- @ext_base_bpe.encode_ordinary(text)
14
- end
15
-
16
- def encode(text, allowed_special: [])
17
- @ext_base_bpe.encode(text, allowed_special)
18
- end
19
-
20
- def decode(tokens)
21
- @ext_base_bpe.decode(tokens)
22
- end
3
+ class Tiktoken::Encoding
4
+ attr_reader :name
5
+
6
+ # This returns a new Tiktoken::Encoding instance for the requested encoding
7
+ # @param encoding [Symbol] The name of the encoding to load
8
+ # @return [Tiktoken::Encoding] The encoding instance
9
+ def self.for_name(encoding)
10
+ Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
11
+ end
12
+
13
+ # This returns a Tiktoken::Encoding instance for the requested encoding
14
+ # It will reuse an existing encoding if it's already been loaded
15
+ # @param encoding [Symbol] The name of the encoding to load
16
+ # @return [Tiktoken::Encoding] The encoding instance
17
+ def self.for_name_cached(encoding)
18
+ @encodings ||= {}
19
+ @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
20
+ end
21
+
22
+ # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
23
+ # basically it's unescaped
24
+ # @param text [String] The text to encode
25
+ # @return [Array<Integer>] The encoded tokens
26
+ def encode_ordinary(text)
27
+ @ext_base_bpe.encode_ordinary(text)
28
+ end
29
+
30
+ # Encodes the text as a list of integer tokens. This encoding will treat special non text tokens
31
+ # as text unless they're in the allowed_special array. It's basically like the text was escaped
32
+ # @param text [String] The text to encode
33
+ # @param allowed_special [Array<String>] An array of special tokens to allow
34
+ # @return [Array<Integer>] The encoded tokens
35
+ def encode(text, allowed_special: [])
36
+ @ext_base_bpe.encode(text, allowed_special)
37
+ end
38
+
39
+ # Decodes the tokens back into text
40
+ # @param tokens [Array<Integer>] The tokens to decode
41
+ # @return [String] The decoded text
42
+ def decode(tokens)
43
+ @ext_base_bpe.decode(tokens)
44
+ end
45
+
46
+ private
47
+
48
+ def initialize(ext_base_bpe, name)
49
+ @ext_base_bpe = ext_base_bpe
50
+ @name = name
51
+ end
23
52
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tiktoken
4
- VERSION = "0.0.2"
4
+ VERSION = "0.0.4"
5
5
  end
data/lib/tiktoken_ruby.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "tiktoken_ruby/version"
4
- require_relative "tiktoken_ruby/encoding.rb"
4
+ require_relative "tiktoken_ruby/encoding"
5
5
 
6
6
  begin
7
7
  RUBY_VERSION =~ /(\d+\.\d+)/
@@ -11,5 +11,104 @@ rescue LoadError
11
11
  end
12
12
 
13
13
  module Tiktoken
14
- class Error < StandardError; end
14
+ class << self
15
+ # Returns an encoding by name. If the encoding is not already loaded it will be loaded, but otherwise
16
+ # it will reuse the instance of that type that was previously loaded
17
+ # @param name [Symbol|String] The name of the encoding to load
18
+ # @return [Tiktoken::Encoding] The encoding instance
19
+ # @example Encode and decode text
20
+ # enc = Tiktoken.get_encoding("cl100k_base")
21
+ # enc.decode(enc.encode("hello world")) #=> "hello world"
22
+ def get_encoding(name)
23
+ name = name.to_sym
24
+ return nil unless SUPPORTED_ENCODINGS.include?(name)
25
+
26
+ Tiktoken::Encoding.for_name_cached(name)
27
+ end
28
+
29
+ # Gets the encoding for an OpenAI model
30
+ # @param model_name [Symbol|String] The name of the model to get the encoding for
31
+ # @return [Tiktoken::Encoding] The encoding instance
32
+ # @example Count tokens for text
33
+ # enc = Tiktoken.encoding_for_model("gpt-4")
34
+ # enc.encode("hello world").length #=> 2
35
+ def encoding_for_model(model_name)
36
+ PREFIX_MODELS.each do |prefix|
37
+ if model_name.to_s.start_with?("#{prefix}-")
38
+ model_name = prefix
39
+ break
40
+ end
41
+ end
42
+
43
+ encoding_name = MODEL_TO_ENCODING_NAME[model_name.to_sym]
44
+ return nil unless encoding_name
45
+
46
+ get_encoding(encoding_name)
47
+ end
48
+
49
+ # Lists all the encodings that are supported
50
+ # @return [Array<Symbol>] The list of supported encodings
51
+ def list_encoding_names
52
+ SUPPORTED_ENCODINGS
53
+ end
54
+
55
+ # Lists all the models that are supported
56
+ # @return [Array<Symbol>] The list of supported models
57
+ def list_model_names
58
+ MODEL_TO_ENCODING_NAME.keys
59
+ end
60
+
61
+ private
62
+
63
+ SUPPORTED_ENCODINGS = [
64
+ :r50k_base,
65
+ :p50k_base,
66
+ :p50k_edit,
67
+ :cl100k_base
68
+ ]
69
+
70
+ # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
71
+ # that is also MIT licensed but by OpenAI
72
+ MODEL_TO_ENCODING_NAME = {
73
+ "gpt-4": "cl100k_base",
74
+ "gpt-3.5-turbo": "cl100k_base",
75
+ # text
76
+ "text-davinci-003": "p50k_base",
77
+ "text-davinci-002": "p50k_base",
78
+ "text-davinci-001": "r50k_base",
79
+ "text-curie-001": "r50k_base",
80
+ "text-babbage-001": "r50k_base",
81
+ "text-ada-001": "r50k_base",
82
+ davinci: "r50k_base",
83
+ curie: "r50k_base",
84
+ babbage: "r50k_base",
85
+ ada: "r50k_base",
86
+ # code
87
+ "code-davinci-002": "p50k_base",
88
+ "code-davinci-001": "p50k_base",
89
+ "code-cushman-002": "p50k_base",
90
+ "code-cushman-001": "p50k_base",
91
+ "davinci-codex": "p50k_base",
92
+ "cushman-codex": "p50k_base",
93
+ # edit
94
+ "text-davinci-edit-001": "p50k_edit",
95
+ "code-davinci-edit-001": "p50k_edit",
96
+ # embeddings
97
+ "text-embedding-ada-002": "cl100k_base",
98
+ # old embeddings
99
+ "text-similarity-davinci-001": "r50k_base",
100
+ "text-similarity-curie-001": "r50k_base",
101
+ "text-similarity-babbage-001": "r50k_base",
102
+ "text-similarity-ada-001": "r50k_base",
103
+ "text-search-davinci-doc-001": "r50k_base",
104
+ "text-search-curie-doc-001": "r50k_base",
105
+ "text-search-babbage-doc-001": "r50k_base",
106
+ "text-search-ada-doc-001": "r50k_base",
107
+ "code-search-babbage-code-001": "r50k_base",
108
+ "code-search-ada-code-001": "r50k_base"
109
+ }
110
+
111
+ # these are models that have versioned variants that are otherwise identical
112
+ PREFIX_MODELS = ["gpt-4", "gpt-3.5-turbo"]
113
+ end
15
114
  end
@@ -9,18 +9,21 @@ Gem::Specification.new do |spec|
9
9
  spec.email = ["isaac.a.park@gmail.com"]
10
10
 
11
11
  spec.summary = "Ruby wrapper for Tiktoken"
12
- spec.description = "Unofficial Ruby wrapper for Tiktoken by way of the unofficial rust bindings"
12
+ spec.description = "An unofficial Ruby wrapper for Tiktoken, " \
13
+ "a BPE tokenizer written by and used by OpenAI. It can be used to " \
14
+ "count the number of tokens in text before sending it to OpenAI APIs."
15
+
13
16
  spec.homepage = "https://github.com/IAPark/tiktoken_ruby"
14
17
  spec.license = "MIT"
15
- spec.required_ruby_version = ">= 2.6.0"
18
+ spec.required_ruby_version = ">= 2.7.0"
16
19
  spec.required_rubygems_version = ">= 3.1.0"
17
20
  spec.platform = Gem::Platform::RUBY
18
21
 
19
- #spec.metadata["allowed_push_host"] = "TODO: Set to your gem server 'https://example.com'"
20
-
21
22
  spec.metadata["homepage_uri"] = spec.homepage
22
23
  spec.metadata["source_code_uri"] = "https://github.com/IAPark/tiktoken_ruby"
23
- #spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
24
+ spec.metadata["documentation_uri"] = "https://rubydoc.info/github/IAPark/tiktoken_ruby/main"
25
+
26
+ # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
24
27
 
25
28
  # Specify which files should be added to the gem when it is released.
26
29
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
@@ -32,11 +35,7 @@ Gem::Specification.new do |spec|
32
35
  spec.bindir = "exe"
33
36
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
34
37
  spec.require_paths = ["lib"]
35
- spec.extensions = ["ext/tiktoken_ruby/Cargo.toml"]
36
-
37
- # Uncomment to register a new dependency of your gem
38
- # spec.add_dependency "example-gem", "~> 1.0"
39
- # spec.add_dependency "rb_sys", "~> 0.9"
38
+ spec.extensions = ["ext/tiktoken_ruby/extconf.rb"]
40
39
 
41
40
  # For more information and examples about making a new gem, check out our
42
41
  # guide at: https://bundler.io/guides/creating_gem.html
metadata CHANGED
@@ -1,21 +1,23 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiktoken_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - IAPark
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-03-19 00:00:00.000000000 Z
11
+ date: 2023-03-28 00:00:00.000000000 Z
12
12
  dependencies: []
13
- description: Unofficial Ruby wrapper for Tiktoken by way of the unofficial rust bindings
13
+ description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
14
+ used by OpenAI. It can be used to count the number of tokens in text before sending
15
+ it to OpenAI APIs.
14
16
  email:
15
17
  - isaac.a.park@gmail.com
16
18
  executables: []
17
19
  extensions:
18
- - ext/tiktoken_ruby/Cargo.toml
20
+ - ext/tiktoken_ruby/extconf.rb
19
21
  extra_rdoc_files: []
20
22
  files:
21
23
  - ".rspec"
@@ -27,6 +29,7 @@ files:
27
29
  - LICENSE.txt
28
30
  - README.md
29
31
  - Rakefile
32
+ - doctest_helper.rb
30
33
  - ext/tiktoken_ruby/Cargo.toml
31
34
  - ext/tiktoken_ruby/extconf.rb
32
35
  - ext/tiktoken_ruby/src/core_bpe_wrapper.rs
@@ -42,6 +45,7 @@ licenses:
42
45
  metadata:
43
46
  homepage_uri: https://github.com/IAPark/tiktoken_ruby
44
47
  source_code_uri: https://github.com/IAPark/tiktoken_ruby
48
+ documentation_uri: https://rubydoc.info/github/IAPark/tiktoken_ruby/main
45
49
  post_install_message:
46
50
  rdoc_options: []
47
51
  require_paths:
@@ -50,7 +54,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
50
54
  requirements:
51
55
  - - ">="
52
56
  - !ruby/object:Gem::Version
53
- version: 2.6.0
57
+ version: 2.7.0
54
58
  required_rubygems_version: !ruby/object:Gem::Requirement
55
59
  requirements:
56
60
  - - ">="