tiktoken_ruby 0.0.6 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e011f7a82a6ee3d09405cd2c817d2ba9fe06e5a3bdde3ee622c9aefd9b1e8dd8
4
- data.tar.gz: 892539805e10e5e10456956aa65b72e7ab4ececae56db2c49ccd11848cd0d2c2
3
+ metadata.gz: 347a10d045e27fca4cdfec03c4d2eac0150448b8f2125d5bcbcd1b92db83499a
4
+ data.tar.gz: de048e8320daa15b27ffa7ccdd9f7ec618cb2a3ad96fe2bedf71e6f780fc8b6f
5
5
  SHA512:
6
- metadata.gz: 5accbfe97904fb1e9b6c5fa793a9e326bb25befaca6dbf5a2f8e6cfcad5f3c80c7dd1d0f66aaa95bed732943a2c0999654ffa2cf40891b2c9443f8909efb1dc6
7
- data.tar.gz: 5e95a59d2876fb54e8a76e7d855c42d4e314be211f495dae5ef0bf4004cc44eeb0ea433a620e037232349a10d107ed25afafa704e1fffcddb4ce29219a199e87
6
+ metadata.gz: 7c587d4b18e777a0f7692aab857fec1fc3b3bcceceab158439197123786475c65d51f5356351bcf0c9c327b94e2ff15a83b2144c2db329aedea0456c1d763ff9
7
+ data.tar.gz: abfdeb836d81555effa5ef6647a77b360dae2b975bb3dc23a22c779f6dd82256630942ddfef01a837ff5195896fa4251925e87dec0d245dce1c8e83719a488f1
data/Cargo.lock CHANGED
@@ -31,16 +31,16 @@ checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a"
31
31
 
32
32
  [[package]]
33
33
  name = "bindgen"
34
- version = "0.66.1"
34
+ version = "0.69.4"
35
35
  source = "registry+https://github.com/rust-lang/crates.io-index"
36
- checksum = "f2b84e06fc203107bfbad243f4aba2af864eb7db3b1cf46ea0a023b0b433d2a7"
36
+ checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0"
37
37
  dependencies = [
38
38
  "bitflags 2.4.0",
39
39
  "cexpr",
40
40
  "clang-sys",
41
+ "itertools",
41
42
  "lazy_static",
42
43
  "lazycell",
43
- "peeking_take_while",
44
44
  "proc-macro2",
45
45
  "quote",
46
46
  "regex",
@@ -114,6 +114,12 @@ dependencies = [
114
114
  "libloading",
115
115
  ]
116
116
 
117
+ [[package]]
118
+ name = "either"
119
+ version = "1.10.0"
120
+ source = "registry+https://github.com/rust-lang/crates.io-index"
121
+ checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a"
122
+
117
123
  [[package]]
118
124
  name = "fancy-regex"
119
125
  version = "0.11.0"
@@ -130,6 +136,15 @@ version = "0.3.1"
130
136
  source = "registry+https://github.com/rust-lang/crates.io-index"
131
137
  checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
132
138
 
139
+ [[package]]
140
+ name = "itertools"
141
+ version = "0.12.1"
142
+ source = "registry+https://github.com/rust-lang/crates.io-index"
143
+ checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
144
+ dependencies = [
145
+ "either",
146
+ ]
147
+
133
148
  [[package]]
134
149
  name = "lazy_static"
135
150
  version = "1.4.0"
@@ -242,12 +257,6 @@ dependencies = [
242
257
  "windows-sys",
243
258
  ]
244
259
 
245
- [[package]]
246
- name = "peeking_take_while"
247
- version = "0.1.2"
248
- source = "registry+https://github.com/rust-lang/crates.io-index"
249
- checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
250
-
251
260
  [[package]]
252
261
  name = "proc-macro2"
253
262
  version = "1.0.66"
@@ -268,18 +277,18 @@ dependencies = [
268
277
 
269
278
  [[package]]
270
279
  name = "rb-sys"
271
- version = "0.9.81"
280
+ version = "0.9.87"
272
281
  source = "registry+https://github.com/rust-lang/crates.io-index"
273
- checksum = "a57240b308b155b09dce81e32829966a99f52d1088b45957e4283e526c5317a1"
282
+ checksum = "225103e3d69bbfe8831f9fd0d2461335f3a9dd06aa6e88bcb6d6970383494d06"
274
283
  dependencies = [
275
284
  "rb-sys-build",
276
285
  ]
277
286
 
278
287
  [[package]]
279
288
  name = "rb-sys-build"
280
- version = "0.9.81"
289
+ version = "0.9.87"
281
290
  source = "registry+https://github.com/rust-lang/crates.io-index"
282
- checksum = "f24ce877a4c5d07f06f6aa6fec3ac95e4b357b9f73b0f5445d8cbb7266d410e8"
291
+ checksum = "bacce8095a5167d5ede618bbd9353e9d9e2f32ddaf54be911106f0ee6baacf09"
283
292
  dependencies = [
284
293
  "bindgen",
285
294
  "lazy_static",
data/Gemfile CHANGED
@@ -2,15 +2,11 @@
2
2
 
3
3
  source "https://rubygems.org"
4
4
 
5
- # Specify your gem's dependencies in tiktoken_ruby.gemspec
6
5
  gemspec
7
6
 
8
- gem "rake", "~> 13.0"
9
-
7
+ gem "rake"
10
8
  gem "rake-compiler"
11
-
12
- gem "rspec", "~> 3.0"
13
-
14
- gem "standard", "~> 1.3"
15
-
16
- gem "yard-doctest", "~> 0.1.17"
9
+ gem "rspec"
10
+ gem "standard"
11
+ gem "yard-doctest"
12
+ gem "racc"
data/Gemfile.lock CHANGED
@@ -1,64 +1,74 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tiktoken_ruby (0.0.6)
5
- rb_sys (~> 0.9.68)
4
+ tiktoken_ruby (0.0.8)
5
+ rb_sys (>= 0.9.86)
6
6
 
7
7
  GEM
8
8
  remote: https://rubygems.org/
9
9
  specs:
10
10
  ast (2.4.2)
11
11
  diff-lcs (1.5.0)
12
- json (2.6.3)
12
+ json (2.7.1)
13
13
  language_server-protocol (3.17.0.3)
14
- minitest (5.18.0)
15
- parallel (1.22.1)
16
- parser (3.2.1.1)
14
+ lint_roller (1.1.0)
15
+ minitest (5.21.2)
16
+ parallel (1.24.0)
17
+ parser (3.3.0.4)
17
18
  ast (~> 2.4.1)
19
+ racc
20
+ racc (1.7.3)
18
21
  rainbow (3.1.1)
19
- rake (13.0.6)
20
- rake-compiler (1.2.1)
22
+ rake (13.1.0)
23
+ rake-compiler (1.2.5)
21
24
  rake
22
- rb_sys (0.9.68)
23
- regexp_parser (2.7.0)
24
- rexml (3.2.5)
25
+ rb_sys (0.9.86)
26
+ regexp_parser (2.9.0)
27
+ rexml (3.2.6)
25
28
  rspec (3.12.0)
26
29
  rspec-core (~> 3.12.0)
27
30
  rspec-expectations (~> 3.12.0)
28
31
  rspec-mocks (~> 3.12.0)
29
- rspec-core (3.12.1)
32
+ rspec-core (3.12.2)
30
33
  rspec-support (~> 3.12.0)
31
- rspec-expectations (3.12.2)
34
+ rspec-expectations (3.12.3)
32
35
  diff-lcs (>= 1.2.0, < 2.0)
33
36
  rspec-support (~> 3.12.0)
34
- rspec-mocks (3.12.4)
37
+ rspec-mocks (3.12.6)
35
38
  diff-lcs (>= 1.2.0, < 2.0)
36
39
  rspec-support (~> 3.12.0)
37
- rspec-support (3.12.0)
38
- rubocop (1.48.1)
40
+ rspec-support (3.12.1)
41
+ rubocop (1.59.0)
39
42
  json (~> 2.3)
43
+ language_server-protocol (>= 3.17.0)
40
44
  parallel (~> 1.10)
41
- parser (>= 3.2.0.0)
45
+ parser (>= 3.2.2.4)
42
46
  rainbow (>= 2.2.2, < 4.0)
43
47
  regexp_parser (>= 1.8, < 3.0)
44
48
  rexml (>= 3.2.5, < 4.0)
45
- rubocop-ast (>= 1.26.0, < 2.0)
49
+ rubocop-ast (>= 1.30.0, < 2.0)
46
50
  ruby-progressbar (~> 1.7)
47
51
  unicode-display_width (>= 2.4.0, < 3.0)
48
- rubocop-ast (1.27.0)
52
+ rubocop-ast (1.30.0)
49
53
  parser (>= 3.2.1.0)
50
- rubocop-performance (1.16.0)
51
- rubocop (>= 1.7.0, < 2.0)
52
- rubocop-ast (>= 0.4.0)
54
+ rubocop-performance (1.20.2)
55
+ rubocop (>= 1.48.1, < 2.0)
56
+ rubocop-ast (>= 1.30.0, < 2.0)
53
57
  ruby-progressbar (1.13.0)
54
- standard (1.25.1)
58
+ standard (1.33.0)
55
59
  language_server-protocol (~> 3.17.0.2)
56
- rubocop (= 1.48.1)
57
- rubocop-performance (= 1.16.0)
58
- unicode-display_width (2.4.2)
59
- webrick (1.7.0)
60
- yard (0.9.28)
61
- webrick (~> 1.7.0)
60
+ lint_roller (~> 1.0)
61
+ rubocop (~> 1.59.0)
62
+ standard-custom (~> 1.0.0)
63
+ standard-performance (~> 1.3)
64
+ standard-custom (1.0.2)
65
+ lint_roller (~> 1.0)
66
+ rubocop (~> 1.50)
67
+ standard-performance (1.3.1)
68
+ lint_roller (~> 1.1)
69
+ rubocop-performance (~> 1.20.2)
70
+ unicode-display_width (2.5.0)
71
+ yard (0.9.34)
62
72
  yard-doctest (0.1.17)
63
73
  minitest
64
74
  yard
@@ -70,12 +80,13 @@ PLATFORMS
70
80
  x86_64-linux
71
81
 
72
82
  DEPENDENCIES
73
- rake (~> 13.0)
83
+ racc
84
+ rake
74
85
  rake-compiler
75
- rspec (~> 3.0)
76
- standard (~> 1.3)
86
+ rspec
87
+ standard
77
88
  tiktoken_ruby!
78
- yard-doctest (~> 0.1.17)
89
+ yard-doctest
79
90
 
80
91
  BUNDLED WITH
81
92
  2.4.6
data/README.md CHANGED
@@ -1,8 +1,14 @@
1
1
  [![Gem Version](https://badge.fury.io/rb/tiktoken_ruby.svg)](https://badge.fury.io/rb/tiktoken_ruby)
2
+
2
3
  # tiktoken_ruby
3
4
 
4
5
  [Tiktoken](https://github.com/openai/tiktoken) is a BPE tokenizer from OpenAI used with their GPT models.
5
- This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
6
+ This is a wrapper around it aimed primarily at enabling accurate counts of GPT model tokens used.
7
+
8
+ ## Request for maintainers
9
+
10
+ I can't really put substantial time into maintaining this. Probably nothing more than a couple of hours every few months. If you have experience maintaining Ruby gems and would like to
11
+ lend a hand, please send me an email or reply to this [issue](https://github.com/IAPark/tiktoken_ruby/issues/26).
6
12
 
7
13
  ## Installation
8
14
 
@@ -15,17 +21,19 @@ If bundler is not being used to manage dependencies, install the gem by executin
15
21
  $ gem install tiktoken_ruby
16
22
 
17
23
  ## Usage
24
+
18
25
  Usage should be very similar to the Python library. Here's a simple example:
19
26
 
20
27
  Encode and decode text
28
+
21
29
  ```ruby
22
30
  require 'tiktoken_ruby'
23
-
24
31
  enc = Tiktoken.get_encoding("cl100k_base")
25
32
  enc.decode(enc.encode("hello world")) #=> "hello world"
26
33
  ```
27
34
 
28
35
  Encoders can also be retrieved by model name
36
+
29
37
  ```ruby
30
38
  require 'tiktoken_ruby'
31
39
 
@@ -53,7 +61,6 @@ bundle exec rake compile
53
61
  bundle exec rake spec
54
62
  ```
55
63
 
56
-
57
64
  ## License
58
65
 
59
66
  The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -1,6 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Tiktoken::Encoding
4
+ CACHE_MUTEX = Mutex.new
5
+
4
6
  attr_reader :name
5
7
 
6
8
  # This returns a new Tiktoken::Encoding instance for the requested encoding
@@ -15,8 +17,10 @@ class Tiktoken::Encoding
15
17
  # @param encoding [Symbol] The name of the encoding to load
16
18
  # @return [Tiktoken::Encoding] The encoding instance
17
19
  def self.for_name_cached(encoding)
18
- @encodings ||= {}
19
- @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
20
+ CACHE_MUTEX.synchronize do
21
+ @encodings ||= {}
22
+ @encodings[encoding.to_sym] ||= Tiktoken::Encoding.for_name(encoding)
23
+ end
20
24
  end
21
25
 
22
26
  # Encodes the text as a list of integer tokens. This encoding will encode special non text tokens
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tiktoken
4
- VERSION = "0.0.6"
4
+ VERSION = "0.0.8"
5
5
  end
data/lib/tiktoken_ruby.rb CHANGED
@@ -28,22 +28,22 @@ module Tiktoken
28
28
 
29
29
  # Gets the encoding for an OpenAI model
30
30
  # @param model_name [Symbol|String] The name of the model to get the encoding for
31
- # @return [Tiktoken::Encoding] The encoding instance
31
+ # @return [Tiktoken::Encoding, nil] The encoding instance, or nil if no encoding is found
32
32
  # @example Count tokens for text
33
33
  # enc = Tiktoken.encoding_for_model("gpt-4")
34
34
  # enc.encode("hello world").length #=> 2
35
35
  def encoding_for_model(model_name)
36
- PREFIX_MODELS.each do |prefix|
37
- if model_name.to_s.start_with?("#{prefix}-")
38
- model_name = prefix
39
- break
40
- end
36
+ if MODEL_TO_ENCODING_NAME.key?(model_name.to_sym)
37
+ return get_encoding(MODEL_TO_ENCODING_NAME[model_name.to_sym])
41
38
  end
42
39
 
43
- encoding_name = MODEL_TO_ENCODING_NAME[model_name.to_sym]
44
- return nil unless encoding_name
40
+ _prefix, encoding = MODEL_PREFIX_TO_ENCODING.find do |prefix, _encoding|
41
+ model_name.start_with?(prefix.to_s)
42
+ end
45
43
 
46
- get_encoding(encoding_name)
44
+ if encoding
45
+ get_encoding(encoding)
46
+ end
47
47
  end
48
48
 
49
49
  # Lists all the encodings that are supported
@@ -67,12 +67,22 @@ module Tiktoken
67
67
  :cl100k_base
68
68
  ]
69
69
 
70
- # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L13-L53
70
+ # taken from the python library here https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
71
71
  # that is also MIT licensed but by OpenAI
72
72
  MODEL_TO_ENCODING_NAME = {
73
+ # chat
73
74
  "gpt-4": "cl100k_base",
74
75
  "gpt-3.5-turbo": "cl100k_base",
75
- # text
76
+ "gpt-35-turbo": "cl100k_base", # Azure deployment name
77
+ # base
78
+ "davinci-002": "cl100k_base",
79
+ "babbage-002": "cl100k_base",
80
+ # embeddings
81
+ "text-embedding-ada-002": "cl100k_base",
82
+ "text-embedding-3-small": "cl100k_base",
83
+ "text-embedding-3-large": "cl100k_base",
84
+ # DEPRECATED MODELS
85
+ # text (DEPRECATED)
76
86
  "text-davinci-003": "p50k_base",
77
87
  "text-davinci-002": "p50k_base",
78
88
  "text-davinci-001": "r50k_base",
@@ -83,19 +93,17 @@ module Tiktoken
83
93
  curie: "r50k_base",
84
94
  babbage: "r50k_base",
85
95
  ada: "r50k_base",
86
- # code
96
+ # code (DEPRECATED)
87
97
  "code-davinci-002": "p50k_base",
88
98
  "code-davinci-001": "p50k_base",
89
99
  "code-cushman-002": "p50k_base",
90
100
  "code-cushman-001": "p50k_base",
91
101
  "davinci-codex": "p50k_base",
92
102
  "cushman-codex": "p50k_base",
93
- # edit
103
+ # edit (DEPRECATED)
94
104
  "text-davinci-edit-001": "p50k_edit",
95
105
  "code-davinci-edit-001": "p50k_edit",
96
- # embeddings
97
- "text-embedding-ada-002": "cl100k_base",
98
- # old embeddings
106
+ # old embeddings (DEPRECATED)
99
107
  "text-similarity-davinci-001": "r50k_base",
100
108
  "text-similarity-curie-001": "r50k_base",
101
109
  "text-similarity-babbage-001": "r50k_base",
@@ -105,10 +113,21 @@ module Tiktoken
105
113
  "text-search-babbage-doc-001": "r50k_base",
106
114
  "text-search-ada-doc-001": "r50k_base",
107
115
  "code-search-babbage-code-001": "r50k_base",
108
- "code-search-ada-code-001": "r50k_base"
116
+ "code-search-ada-code-001": "r50k_base",
117
+ # open source
118
+ gpt2: "gpt2"
109
119
  }
110
120
 
111
- # these are models that have a versioned models that are otherwise identical
112
- PREFIX_MODELS = ["gpt-4", "gpt-3.5-turbo"]
121
+ MODEL_PREFIX_TO_ENCODING = {
122
+ # chat
123
+ "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
124
+ "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
125
+ "gpt-35-turbo-": "cl100k_base", # Azure deployment name
126
+ # fine-tuned
127
+ "ft:gpt-4": "cl100k_base",
128
+ "ft:gpt-3.5-turbo": "cl100k_base",
129
+ "ft:davinci-002": "cl100k_base",
130
+ "ft:babbage-002": "cl100k_base"
131
+ }
113
132
  end
114
133
  end
@@ -7,12 +7,10 @@ Gem::Specification.new do |spec|
7
7
  spec.version = Tiktoken::VERSION
8
8
  spec.authors = ["IAPark"]
9
9
  spec.email = ["isaac.a.park@gmail.com"]
10
-
11
10
  spec.summary = "Ruby wrapper for Tiktoken"
12
11
  spec.description = "An unofficial Ruby wrapper for Tiktoken, " \
13
12
  "a BPE tokenizer written by and used by OpenAI. It can be used to " \
14
13
  "count the number of tokens in text before sending it to OpenAI APIs."
15
-
16
14
  spec.homepage = "https://github.com/IAPark/tiktoken_ruby"
17
15
  spec.license = "MIT"
18
16
  spec.required_ruby_version = ">= 2.7.0"
@@ -22,11 +20,6 @@ Gem::Specification.new do |spec|
22
20
  spec.metadata["homepage_uri"] = spec.homepage
23
21
  spec.metadata["source_code_uri"] = "https://github.com/IAPark/tiktoken_ruby"
24
22
  spec.metadata["documentation_uri"] = "https://rubydoc.info/github/IAPark/tiktoken_ruby/main"
25
-
26
- # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
27
-
28
- # Specify which files should be added to the gem when it is released.
29
- # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
30
23
  spec.files = Dir.chdir(__dir__) do
31
24
  `git ls-files -z`.split("\x0").reject do |f|
32
25
  (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|circleci)|appveyor)})
@@ -36,9 +29,5 @@ Gem::Specification.new do |spec|
36
29
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
37
30
  spec.require_paths = ["lib"]
38
31
  spec.extensions = ["ext/tiktoken_ruby/extconf.rb"]
39
-
40
- spec.add_dependency "rb_sys", "~> 0.9.68"
41
-
42
- # For more information and examples about making a new gem, check out our
43
- # guide at: https://bundler.io/guides/creating_gem.html
32
+ spec.add_dependency "rb_sys", ">= 0.9.86"
44
33
  end
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiktoken_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - IAPark
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-09-24 00:00:00.000000000 Z
11
+ date: 2024-04-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: 0.9.68
19
+ version: 0.9.86
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: 0.9.68
26
+ version: 0.9.86
27
27
  description: An unofficial Ruby wrapper for Tiktoken, a BPE tokenizer written by and
28
28
  used by OpenAI. It can be used to count the number of tokens in text before sending
29
29
  it to OpenAI APIs.