tiktoken_ruby 0.0.2-x64-mingw32 → 0.0.4-x64-mingw32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f1f201c1f241950d98f5fb3f73eb4a6713639748686b7aa5e1bbf291a1c17409
4
- data.tar.gz: 4a467c7b2e50cbe85d4b9fbb6a0a51050f5bdf0eea4f61e903448b1c483890de
3
+ metadata.gz: d23060d6a5e854ef85dfe077241d8e107bd190d72c53730a2baa6c1c16ad265b
4
+ data.tar.gz: 552377dfabaf29d5ecc187eafb1275199aaa69424e9920b2944e056d58821578
5
5
  SHA512:
6
- metadata.gz: 15dfa53dd93a6c68119d98b3d09cbf6d57b96abf6bff3d473b28155e3d22ee4b94b419fab9fd3c5bfb1d0b0baf880ca99844de94aa82c6e6d22f28d5ea961076
7
- data.tar.gz: c4c56bd6de0210ee0a23237c57bafd0f2c141a0a7001d203b4d1a69014c0a325b69c87a16837d8c49ecaab4dcbfaded65086b905df701a93a21047d0d736a331
6
+ metadata.gz: c44bc1c3df4c4df0777a16a4dbf2e16d052ab2363f07a1490c5b28f0e164e18ec537859134a4b703c0451e359db3cfde38a89fbe4fbadbe71327885d719573e0
7
+ data.tar.gz: 855350295f5499e18369bfcfd7bcccb960d89214c3ba7bf127b18cd9c1e289ba20022b0d980f7be523392cbe0079aa2d245effee3cac98f306f765e24c0c6d84
data/Gemfile CHANGED
@@ -13,4 +13,5 @@ gem "rb_sys"
13
13
  gem "rspec", "~> 3.0"
14
14
 
15
15
  gem "standard", "~> 1.3"
16
- gem 'pry', '~> 0.14.2'
16
+
17
+ gem "yard-doctest", "~> 0.1.17"
data/Gemfile.lock CHANGED
@@ -1,23 +1,19 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tiktoken_ruby (0.0.2)
4
+ tiktoken_ruby (0.0.4)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
9
  ast (2.4.2)
10
- coderay (1.1.3)
11
10
  diff-lcs (1.5.0)
12
11
  json (2.6.3)
13
12
  language_server-protocol (3.17.0.3)
14
- method_source (1.0.0)
13
+ minitest (5.18.0)
15
14
  parallel (1.22.1)
16
15
  parser (3.2.1.1)
17
16
  ast (~> 2.4.1)
18
- pry (0.14.2)
19
- coderay (~> 1.1)
20
- method_source (~> 1.0)
21
17
  rainbow (3.1.1)
22
18
  rake (13.0.6)
23
19
  rake-compiler (1.2.1)
@@ -59,19 +55,26 @@ GEM
59
55
  rubocop (= 1.48.1)
60
56
  rubocop-performance (= 1.16.0)
61
57
  unicode-display_width (2.4.2)
58
+ webrick (1.7.0)
59
+ yard (0.9.28)
60
+ webrick (~> 1.7.0)
61
+ yard-doctest (0.1.17)
62
+ minitest
63
+ yard
62
64
 
63
65
  PLATFORMS
64
66
  arm64-darwin-22
67
+ x86_64-darwin-22
65
68
  x86_64-linux
66
69
 
67
70
  DEPENDENCIES
68
- pry (~> 0.14.2)
69
71
  rake (~> 13.0)
70
72
  rake-compiler
71
73
  rb_sys
72
74
  rspec (~> 3.0)
73
75
  standard (~> 1.3)
74
76
  tiktoken_ruby!
77
+ yard-doctest (~> 0.1.17)
75
78
 
76
79
  BUNDLED WITH
77
80
  2.4.6
data/README.md CHANGED
@@ -15,11 +15,22 @@ If bundler is not being used to manage dependencies, install the gem by executin
15
15
  $ gem install tiktoken_ruby
16
16
 
17
17
  ## Usage
18
+ Usage should be very similar to the Python library. Here's a simple example:
18
19
 
20
+ Encode and decode text
19
21
  ```ruby
20
- encoding = Tiktoken::Encoding.r50k_base
21
- tokens = encoding.encode("Hello world!")
22
- puts encoding.decode(tokens)
22
+ require 'tiktoken_ruby'
23
+
24
+ enc = Tiktoken.get_encoding("cl100k_base")
25
+ enc.decode(enc.encode("hello world")) #=> "hello world"
26
+ ```
27
+
28
+ Encoders can also be retrieved by model name
29
+ ```ruby
30
+ require 'tiktoken_ruby'
31
+
32
+ enc = Tiktoken.encoding_for_model("gpt-4")
33
+ enc.encode("hello world").length #=> 2
23
34
  ```
24
35
 
25
36
  ## Development
@@ -32,6 +43,17 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
32
43
 
33
44
  Bug reports and pull requests are welcome on GitHub at https://github.com/iapark/tiktoken_ruby.
34
45
 
46
+ To get started with development:
47
+
48
+ ```sh
49
+ git clone https://github.com/IAPark/tiktoken_ruby.git
50
+ cd tiktoken_ruby
51
+ bundle install
52
+ bundle exec rake compile
53
+ bundle exec rake spec
54
+ ```
55
+
56
+
35
57
  ## License
36
58
 
37
59
  The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile CHANGED
@@ -14,7 +14,6 @@ end
14
14
 
15
15
  RSpec::Core::RakeTask.new(:spec)
16
16
 
17
-
18
17
  task :native, [:platform] do |_t, platform:|
19
18
  sh "bundle", "exec", "rb-sys-dock", "--platform", platform, "--build"
20
19
  end
data/doctest_helper.rb ADDED
@@ -0,0 +1 @@
1
+ require "lib/tiktoken_ruby"
Binary file
Binary file
@@ -1,23 +1,52 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- class Tiktoken::Encoding
4
- def self.method_missing(method)
5
- Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(method))
6
- end
7
-
8
- def initialize(ext_base_bpe)
9
- @ext_base_bpe = ext_base_bpe
10
- end
11
-
12
- def encode_ordinary(text)
13
- @ext_base_bpe.encode_ordinary(text)
14
- end
15
-
16
- def encode(text, allowed_special: [])
17
- @ext_base_bpe.encode(text, allowed_special)
18
- end
19
-
20
- def decode(tokens)
21
- @ext_base_bpe.decode(tokens)
22
- end
3
+ class Tiktoken::Encoding
4
+ attr_reader :name
5
+
6
+ # This returns a new Tiktoken::Encoding instance for the requested encoding
7
+ # @param encoding [Symbol] The name of the encoding to load
8
+ # @return [Tiktoken::Encoding] The encoding instance
9
+ def self.for_name(encoding)
10
+ Tiktoken::Encoding.new(Tiktoken::BpeFactory.send(encoding.to_sym), encoding.to_sym)
11
+ end
12
+
13
+ # This returns a Tiktoken::Encoding instance for the requested encoding
14
+ # It will reuse an existing encoding if it's already been loaded
15
+ # @param encoding [Symbol] The name of the encoding to load
16
+ # @return [Tiktoken::Encoding] The encoding instance
17
+ def self.for_name_cached(encoding)
18
+ @encodings ||= {}
19
+ @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
20
+ end
21
+
22
+ # Encodes the text as a list of integer tokens. This encoding will encode special non-text tokens
23
+ # basically it's unescaped
24
+ # @param text [String] The text to encode
25
+ # @return [Array<Integer>] The encoded tokens
26
+ def encode_ordinary(text)
27
+ @ext_base_bpe.encode_ordinary(text)
28
+ end
29
+
30
+ # Encodes the text as a list of integer tokens. This encoding will treat special non-text tokens
31
+ # as text unless they're in the allowed_special array. It's basically like the text was escaped
32
+ # @param text [String] The text to encode
33
+ # @param allowed_special [Array<String>] An array of special tokens to allow
34
+ # @return [Array<Integer>] The encoded tokens
35
+ def encode(text, allowed_special: [])
36
+ @ext_base_bpe.encode(text, allowed_special)
37
+ end
38
+
39
+ # Decodes the tokens back into text
40
+ # @param tokens [Array<Integer>] The tokens to decode
41
+ # @return [String] The decoded text
42
+ def decode(tokens)
43
+ @ext_base_bpe.decode(tokens)
44
+ end
45
+
46
+ private
47
+
48
+ def initialize(ext_base_bpe, name)
49
+ @ext_base_bpe = ext_base_bpe
50
+ @name = name
51
+ end
23
52
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tiktoken
4
- VERSION = "0.0.2"
4
+ VERSION = "0.0.4"
5
5
  end
data/lib/tiktoken_ruby.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "tiktoken_ruby/version"
4
- require_relative "tiktoken_ruby/encoding.rb"
4
+ require_relative "tiktoken_ruby/encoding"
5
5
 
6
6
  begin
7
7
  RUBY_VERSION =~ /(\d+\.\d+)/
@@ -11,5 +11,104 @@ rescue LoadError
11
11
  end
12
12
 
13
13
  module Tiktoken
14
- class Error < StandardError; end
14
+ class << self
15
+ # Returns an encoding by name. If the encoding is not already loaded it will be loaded, but otherwise
16
+ # it will reuse the instance of that type that was previously loaded
17
+ # @param name [Symbol|String] The name of the encoding to load
18
+ # @return [Tiktoken::Encoding] The encoding instance
19
+ # @example Encode and decode text
20
+ # enc = Tiktoken.get_encoding("cl100k_base")
21
+ # enc.decode(enc.encode("hello world")) #=> "hello world"
22
+ def get_encoding(name)
23
+ name = name.to_sym
24
+ return nil unless SUPPORTED_ENCODINGS.include?(name)
25
+
26
+ Tiktoken::Encoding.for_name_cached(name)
27
+ end
28
+
29
+ # Gets the encoding for an OpenAI model
30
+ # @param model_name [Symbol|String] The name of the model to get the encoding for
31
+ # @return [Tiktoken::Encoding] The encoding instance
32
+ # @example Count tokens for text
33
+ # enc = Tiktoken.encoding_for_model("gpt-4")
34
+ # enc.encode("hello world").length #=> 2
35
+ def encoding_for_model(model_name)
36
+ PREFIX_MODELS.each do |prefix|
37
+ if model_name.to_s.start_with?("#{prefix}-")
38
+ model_name = prefix
39
+ break
40
+ end
41
+ end
42
+
43
+ encoding_name = MODEL_TO_ENCODING_NAME[model_name.to_sym]
44
+ return nil unless encoding_name
45
+
46
+ get_encoding(encoding_name)
47
+ end
48
+
49
+ # Lists all the encodings that are supported
50
+ # @return [Array<Symbol>] The list of supported encodings
51
+ def list_encoding_names
52
+ SUPPORTED_ENCODINGS
53
+ end
54
+
55
+ # Lists all the models that are supported
56
+ # @return [Array<Symbol>] The list of supported models
57
+ def list_model_names
58
+ MODEL_TO_ENCODING_NAME.keys
59
+ end
60
+
61
+ private
62
+
63
+ SUPPORTED_ENCODINGS = [
64
+ :r50k_base,
65
+ :p50k_base,
66
+ :p50k_edit,
67
+ :cl100k_base
68
+ ]
69
+
70
+ # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
71
+ # that is also MIT licensed but by OpenAI
72
+ MODEL_TO_ENCODING_NAME = {
73
+ "gpt-4": "cl100k_base",
74
+ "gpt-3.5-turbo": "cl100k_base",
75
+ # text
76
+ "text-davinci-003": "p50k_base",
77
+ "text-davinci-002": "p50k_base",
78
+ "text-davinci-001": "r50k_base",
79
+ "text-curie-001": "r50k_base",
80
+ "text-babbage-001": "r50k_base",
81
+ "text-ada-001": "r50k_base",
82
+ davinci: "r50k_base",
83
+ curie: "r50k_base",
84
+ babbage: "r50k_base",
85
+ ada: "r50k_base",
86
+ # code
87
+ "code-davinci-002": "p50k_base",
88
+ "code-davinci-001": "p50k_base",
89
+ "code-cushman-002": "p50k_base",
90
+ "code-cushman-001": "p50k_base",
91
+ "davinci-codex": "p50k_base",
92
+ "cushman-codex": "p50k_base",
93
+ # edit
94
+ "text-davinci-edit-001": "p50k_edit",
95
+ "code-davinci-edit-001": "p50k_edit",
96
+ # embeddings
97
+ "text-embedding-ada-002": "cl100k_base",
98
+ # old embeddings
99
+ "text-similarity-davinci-001": "r50k_base",
100
+ "text-similarity-curie-001": "r50k_base",
101
+ "text-similarity-babbage-001": "r50k_base",
102
+ "text-similarity-ada-001": "r50k_base",
103
+ "text-search-davinci-doc-001": "r50k_base",
104
+ "text-search-curie-doc-001": "r50k_base",
105
+ "text-search-babbage-doc-001": "r50k_base",
106
+ "text-search-ada-doc-001": "r50k_base",
107
+ "code-search-babbage-code-001": "r50k_base",
108
+ "code-search-ada-code-001": "r50k_base"
109
+ }
110
+
111
+ # these are models that have versioned variants that are otherwise identical
112
+ PREFIX_MODELS = ["gpt-4", "gpt-3.5-turbo"]
113
+ end
15
114
  end
metadata CHANGED
@@ -1,16 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiktoken_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.4
5
5
  platform: x64-mingw32
6
6
  authors:
7
7
  - IAPark
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-03-19 00:00:00.000000000 Z
11
+ date: 2023-03-28 00:00:00.000000000 Z
12
12
  dependencies: []
13
- description: Unofficial Ruby wrapper for Tiktoken by way of the unofficial rust bindings
13
+ description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
14
+ used by OpenAI. It can be used to count the number of tokens in text before sending
15
+ it to OpenAI APIs.
14
16
  email:
15
17
  - isaac.a.park@gmail.com
16
18
  executables: []
@@ -24,6 +26,7 @@ files:
24
26
  - LICENSE.txt
25
27
  - README.md
26
28
  - Rakefile
29
+ - doctest_helper.rb
27
30
  - lib/tiktoken_ruby.rb
28
31
  - lib/tiktoken_ruby/2.7/tiktoken_ruby.so
29
32
  - lib/tiktoken_ruby/3.0/tiktoken_ruby.so
@@ -36,6 +39,7 @@ licenses:
36
39
  metadata:
37
40
  homepage_uri: https://github.com/IAPark/tiktoken_ruby
38
41
  source_code_uri: https://github.com/IAPark/tiktoken_ruby
42
+ documentation_uri: https://rubydoc.info/github/IAPark/tiktoken_ruby/main
39
43
  post_install_message:
40
44
  rdoc_options: []
41
45
  require_paths: